Take all the input files, attach their unique_id, find the ones with replicates, then look at the median of 2*SD (~95% spread around measurement mean) for all measurements to decided on an approriate significance threshold
file_vector = str_remove(list.files(path = "Input", pattern="*.csv", recursive = T), ".csv")
error_set = lapply(paste0("Input/", list.files(path = "Input", pattern="*.csv", recursive = T)), read_csv, skip = 5, col_names = F, col_select = 1:2, show_col_types=F)
for(i in 1:length(error_set)){
error_set[[i]][,3] = file_vector[i]
}
error_set = do.call("rbind", error_set)
colnames(error_set) = c("Genotype", "Phenotype", "unique_id")
error_rates = error_set[which(duplicated(error_set %>%
unite(unique_geno, c(1,3), sep = "_", remove = F) %>%
pull(unique_geno))),] %>%
group_by(Genotype) %>%
summarise(pheno_mean = mean(Phenotype),
pheno_sd = sd(Phenotype))
## % of SDs for measurements that are less than 1.5-fold threshold
sum(error_rates$pheno_sd*2 <= log10(1.5)) / length(error_rates$pheno_sd)
## [1] 0.5170068
median(error_rates$pheno_sd*2)
## [1] 0.1705538
summary(error_rates$pheno_sd*2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.08511 0.17055 0.71405 0.74638 3.73452
error_rates %>%
ggplot(aes(x = pheno_sd*2)) +
geom_histogram(bins = 50) +
geom_vline(xintercept = log10(1.5), lty = 2) +
theme_classic() +
theme(text = element_text(size=18), axis.text = element_text(size = 16, color = "black"),
axis.title.y = element_blank(),
axis.title.x = element_blank(),
legend.position = "none")
## Fold-Change Data
Here we look at the fold-change data used in the input files. This allows us to see positive and negative functional effects across the landscapes.
fit_land = do.call("rbind", lapply(paste0("Output/", list.files(path = "Output", pattern="observed_values.csv", recursive = T)), read_csv, skip = 1, show_col_types=F, col_names = c("id", "effect", "muts", "junk1", "junk2", "enz")))
## Make fit_land have IDs by finding where muts are 0, i.e. start of new dataset, and multiplying
## Those values by the folder names to generate ID vector
fit_land_spots = c(which(fit_land$muts == 0), (dim(fit_land)[1] + 1)) # Add end of dataset
fit_land_counts = c()
for(i in 1:(length(fit_land_spots) - 1)) {
if(i == 1) {
fit_land_counts = c(fit_land_counts, fit_land_spots[i + 1] - 1)
} else {
fit_land_counts = c(fit_land_counts, fit_land_spots[i + 1] - fit_land_spots[i])
}
}
## This assumes folders are read in the order that they are listed
fit_land_ids = rep(str_remove_all(list.files(path = "Output", pattern="observed_values.csv", recursive = T), "/observed_values.csv"), fit_land_counts)
fit_land$unique_id = fit_land_ids
fit_land = fit_land %>% filter(muts != 0)
The data shows the following distribution
c(sum(fit_land$effect < log10(1/1.5)), sum(fit_land$effect >= log10(1/1.5) & fit_land$effect <= log10(1.5)), sum(fit_land$effect > log10(1.5)))
## [1] 360 292 807
And these percentages
c(sum(fit_land$effect < log10(1/1.5)), sum(fit_land$effect >= log10(1/1.5) & fit_land$effect <= log10(1.5)), sum(fit_land$effect > log10(1.5))) / sum(c(sum(fit_land$effect < log10(1/1.5)), sum(fit_land$effect >= log10(1/1.5) & fit_land$effect <= log10(1.5)), sum(fit_land$effect > log10(1.5))))
## [1] 0.2467443 0.2001371 0.5531186
fit_land_plot = fit_land %>%
ggplot(aes(x = effect)) +
geom_histogram(bins = 100, alpha = 0.8, color = "black", lwd = 0.1) +
geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(1/1.5), lty = 2, lwd = 0.2) +
theme_classic() +
labs(x = expression(italic("F")),
y = "Genotype Counts") +
scale_y_continuous(breaks = c(0, 50, 100, 150, 200, 250)) +
theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"))
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
fit_land_plot
#ggsave("supp_fig_1.svg", plot = fit_land_plot, width = 180/2, height = 247/4, dpi = 300, units = "mm")
First we import the ratio datasets, which contains ratio values between predicted function based on the additive model vs observed function. These ratio values serve as a general metric for “epistasis” as defined by the additive model.
all_ratios = lapply(paste0("Output/", list.files(path = "Output", pattern="ratio_export.csv", recursive = T)), read_csv, skip = 1, show_col_types=F, col_names = c("id", "effect", "muts", "cv", "colors"))
all_ratios_id = str_remove_all(list.files(path = "Output", pattern="ratio_export.csv", recursive = T), "/ratio_export.csv")
temp_col_length = length(all_ratios[[1]]) + 1
for(i in 1:length(all_ratios)) {
all_ratios[[i]][, temp_col_length] = rep(all_ratios_id[i], dim(all_ratios[[i]])[1]) ## Add partial_id as a column to each ratio df in list
}
rm(temp_col_length)
## Make into a tibble
all_ratios = do.call("rbind", all_ratios)
colnames(all_ratios) = c("id", "effect", "muts", "cv", "colors", "partial_id")
Filtering ratio data to only include genotypes with 2+ mutations. Outputting length of vector.
all_ratios = all_ratios %>% filter(muts > 1)
length(all_ratios$effect) # examined muts
## [1] 1245
Number and % of synergystic genotypes (i.e. ratio > 1.5-fold)
sum(all_ratios$effect > 1.5) # synergystic
## [1] 502
paste0("Synergystic ", round(sum(all_ratios$effect > 1.5)/length(all_ratios$effect)*100,2), "%", collapse = "") # synergystic %
## [1] "Synergystic 40.32%"
Number and % of neutral genotypes (i.e. ratio > 1.5-fold)
sum(all_ratios$effect >= 1/1.5 & all_ratios$effect <= 1.5) # antagonistic
## [1] 379
paste0("Neutral ", round(sum(all_ratios$effect >= 1/1.5 & all_ratios$effect <= 1.5)/length(all_ratios$effect)*100, 2), "%", collapse = "") # antagonistic %
## [1] "Neutral 30.44%"
Number and % of antagonistic genotypes (i.e. ratio > 1.5-fold)
sum(all_ratios$effect < 1/1.5) # antagonistic
## [1] 364
paste0("Antagonistic ", round(sum(all_ratios$effect < 1/1.5)/length(all_ratios$effect)*100, 2), "%", collapse = "") # antagonistic %
## [1] "Antagonistic 29.24%"
Shows distribution of positive and negative epistasis across the landscapes. The significance threshold is 1.5-fold
all_ratios_plot = all_ratios %>%
ggplot(aes(x = log10(effect))) +
geom_histogram(bins = 100, alpha = 0.8, color = "black", lwd = 0.1) +
geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(1/1.5), lty = 2, lwd = 0.2) +
theme_classic() +
scale_fill_manual(values = c("grey")) +
labs(x = expression("Observed" *italic(" F")*" / Predicted"*italic(" F")),
y = "Genotype Counts") +
theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none")
all_ratios_plot
#ggsave("supp_fig_2.svg", plot = all_ratios_plot, width = 180/2, height = 247/4, dpi = 300, units = "mm")
Also the ratios table. Here we create bins from -Inf to log10(1/1.5), i.e. -0.176, then the neutral range between log10(1/1.5) to log10(1.5), i.e. 0.176, then the positive range from 0.176 to Inf.
all_ratios %>%
mutate(sign = cut(log10(effect), breaks = c(-Inf, log10(1/1.5), log10(1.5), Inf))) %>%
count(sign) %>%
mutate(percent = round(n/sum(n) * 100, 2))
## # A tibble: 3 × 3
## sign n percent
## <fct> <int> <dbl>
## 1 (-Inf,-0.176] 364 29.2
## 2 (-0.176,0.176] 379 30.4
## 3 (0.176, Inf] 502 40.3
We can do the same thing for individual orders: 2nd, 3rd, and 4th order
## 2nd step only
all_ratios_2 = all_ratios %>% filter(muts == 2)
length(all_ratios_2$effect) # examined muts
## [1] 419
sum(all_ratios_2$effect > 1.5) # synergystic
## [1] 135
paste0("Synergystic ", round(sum(all_ratios_2$effect > 1.5)/length(all_ratios_2$effect)*100, 2), "%", collapse = "") # synergystic %
## [1] "Synergystic 32.22%"
sum(all_ratios_2$effect >= 1/1.5 & all_ratios_2$effect <= 1.5) # antagonistic
## [1] 160
paste0("Neutral ", round(sum(all_ratios_2$effect >= 1/1.5 & all_ratios_2$effect <= 1.5)/length(all_ratios_2$effect)*100, 2), "%", collapse = "") # antagonistic %
## [1] "Neutral 38.19%"
sum(all_ratios_2$effect < 1/1.5) # antagonistic
## [1] 124
paste0("Antagonistic ", round(sum(all_ratios_2$effect < 1/1.5)/length(all_ratios_2$effect)*100, 2), "%", collapse = "") # antagonistic %
## [1] "Antagonistic 29.59%"
## 3rd step only
all_ratios_3 = all_ratios %>% filter(muts == 3)
length(all_ratios_3$effect) # examined muts
## [1] 438
sum(all_ratios_3$effect > 1.5) # synergystic
## [1] 160
paste0("Synergystic ", round(sum(all_ratios_3$effect > 1.5)/length(all_ratios_3$effect)*100, 2), "%", collapse = "") # synergystic %
## [1] "Synergystic 36.53%"
sum(all_ratios_3$effect >= 1/1.5 & all_ratios_3$effect <= 1.5) # antagonistic
## [1] 120
paste0("Neutral ", round(sum(all_ratios_3$effect >= 1/1.5 & all_ratios_3$effect <= 1.5)/length(all_ratios_3$effect)*100, 2), "%", collapse = "") # antagonistic %
## [1] "Neutral 27.4%"
sum(all_ratios_3$effect < 1/1.5) # antagonistic
## [1] 158
paste0("Antagonistic ", round(sum(all_ratios_3$effect < 1/1.5)/length(all_ratios_3$effect)*100, 2), "%", collapse = "") # antagonistic %
## [1] "Antagonistic 36.07%"
## 4th step only
all_ratios_4 = all_ratios %>% filter(muts == 4)
length(all_ratios_4$effect) # examined muts
## [1] 267
sum(all_ratios_4$effect > 1.5) # synergystic
## [1] 127
paste0("Synergystic ", round(sum(all_ratios_4$effect > 1.5)/length(all_ratios_4$effect)*100, 2), "%", collapse = "") # synergystic %
## [1] "Synergystic 47.57%"
sum(all_ratios_4$effect >= 1/1.5 & all_ratios_4$effect <= 1.5) # antagonistic
## [1] 72
paste0("Neutral ", round(sum(all_ratios_4$effect >= 1/1.5 & all_ratios_4$effect <= 1.5)/length(all_ratios_4$effect)*100, 2), "%", collapse = "") # antagonistic %
## [1] "Neutral 26.97%"
sum(all_ratios_4$effect < 1/1.5) # antagonistic
## [1] 68
paste0("Antagonistic ", round(sum(all_ratios_4$effect < 1/1.5)/length(all_ratios_4$effect)*100, 2), "%", collapse = "") # antagonistic %
## [1] "Antagonistic 25.47%"
Here we import the functional contribution of single positions (not combinations), which resembles first order analysis across all datasets.
## Reads all csvs in collated folder and combines them
d = do.call("rbind", lapply(paste0("Output/", list.files(path = "Output", pattern="*2d_box.csv", recursive = T)), read_csv, show_col_types=F, col_names = c("positions", "identity", "mut", "effects", "enzyme", "type", "cond")))
d1 = d %>% filter_all(any_vars(!is.na(.))) # removes rows with all NA before making unique ID
d1 = d1 %>%
unite("unique_id", c(positions, enzyme, type, cond), remove = F)
d1 = d1 %>%
unite("partial_id", c(enzyme, type, cond), remove = F)
Spread of functional contributions of the positions. This shows whether introducing a mutation in a given position impacts the function positively or negatively (or neutral) across all backgrounds
d1_plot = d1 %>%
ggplot(aes(x = effects)) +
geom_histogram(bins = 100, alpha = 0.8, color = "black", lwd = 0.1, fill = "#edae49") +
geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(1/1.5), lty = 2, lwd = 0.2) +
theme_classic() +
labs(x = expression(Delta*italic("F")),
y = "Genotype Count") +
theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none")
d1_plot
#ggsave("fig_1A.svg", plot = d1_plot, width = 180/2, height = 247/4, dpi = 300, units = "mm")
What are the percentages of each in the categories?
d1 %>%
count(effects < log10(1/1.5),
effects > log10(1.5)) %>%
mutate(prop = n/sum(n)*100)
## # A tibble: 3 × 4
## `effects < log10(1/1.5)` `effects > log10(1.5)` n prop
## <lgl> <lgl> <int> <dbl>
## 1 FALSE FALSE 1447 35.6
## 2 FALSE TRUE 1760 43.3
## 3 TRUE FALSE 857 21.1
Density plots for each trajectory by enzymes
d1_plot_enz = d1 %>%
ggplot(aes(x = effects, fill = enzyme)) +
geom_density(alpha = 0.8, color = "black", lwd = 0.1) +
geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(1/1.5), lty = 2, lwd = 0.2) +
theme_classic() +
labs(x = expression(Delta*italic("F")),
y = "Genotype Count") +
#scale_y_continuous(breaks = c(0, 50, 100, 150, 200, 250)) +
theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"))
d1_plot_enz
### For WT-normalized data
Load the same kind of data as d1 except that has been normalized do the \(\Delta\)F in the WT background
wt_rel_df = do.call("rbind", lapply(paste0("Output/", list.files(path = "Output", pattern="*wt_rel.csv", recursive = T)), read_csv, show_col_types=F, col_names = c("positions", "identity", "mut", "effects", "enzyme", "type", "cond")))
wt_rel_df = wt_rel_df %>% filter_all(any_vars(!is.na(.))) # removes rows with all NA before making unique ID
wt_rel_df = wt_rel_df %>%
unite("unique_id", c(positions, enzyme, type, cond), remove = F) %>%
unite("partial_id", c(enzyme, type, cond), remove = F)
Then we plot it
wt_rel_plot = wt_rel_df %>%
filter(mut > 1) %>% # filter out first mutations which are by definition 0 delta F
ggplot(aes(x = effects)) +
geom_histogram(bins = 100, alpha = 0.8, color = "black", lwd = 0.1, fill = "#edae49") +
geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(1/1.5), lty = 2, lwd = 0.2) +
theme_classic() +
labs(x = expression(Delta*italic("F ")*"(Single Mutant Normalized)"),
y = "Genotype Count") +
theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none")
wt_rel_plot
#ggsave("fig_1A.svg", plot = d1_plot, width = 180/2, height = 247/4, dpi = 300, units = "mm")
And again for density plots for each enzyme
wt_rel_plot_enz = wt_rel_df %>%
mutate(Enzyme = enzyme) %>%
filter(mut > 1) %>%
ggplot(aes(x = effects, fill = Enzyme)) +
geom_density(alpha = 0.8, color = "black", lwd = 0.1) +
geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(1/1.5), lty = 2, lwd = 0.2) +
theme_classic() +
labs(x = expression(Delta*italic("F ")*"(Single Mutant Normalized)"),
y = "Density") +
scale_x_continuous(limits = c(-2.5, 2.5)) + # Same data omitted for better resolution
theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"))
wt_rel_plot_enz
## Warning: Removed 73 rows containing non-finite values (`stat_density()`).
Given this spread which percentage is significantly (using log10 1.5-fold cutoff) negative, neutral, and positive in that order
wt_rel_df %>%
filter(mut > 1) %>%
mutate(sign = cut(effects, breaks = c(-Inf, log10(1/1.5), log10(1.5), Inf))) %>%
count(sign) %>%
mutate(percent = round(n/sum(n) * 100, 2))
## # A tibble: 3 × 3
## sign n percent
## <fct> <int> <dbl>
## 1 (-Inf,-0.176] 942 24.5
## 2 (-0.176,0.176] 1438 37.4
## 3 (0.176, Inf] 1470 38.2
Looks at 2*SD of each position/combination across all orders to determine functional heterogeneity.
This gives us a quick look at the general spread of heterogenity values and their mean.
idio_d = do.call("rbind", lapply(paste0("Output/", list.files(path = "Output", pattern="idio_df", recursive = T)), read_csv, show_col_types=F))
traj_col = sapply(str_split(list.files(path = "Output", pattern="idio_df", recursive = T), "/"),"[[",1)
traj_col_len = sapply(lapply(paste0("Output/", list.files(path = "Output", pattern="idio_df", recursive = T)), read_csv, show_col_types=F), dim)[1,]
traj_col_full = c()
for(i in 1:length(traj_col)) {
traj_col_full = c(traj_col_full, rep(traj_col[i], traj_col_len[i]))
}
idio_d$traj = traj_col_full
idio_d_order_1 = idio_d %>%
filter(mutations == "Order 1") %>%
ggplot(aes(x = idiosync)) +
geom_histogram(binwidth = 0.08, color = "black", lwd = 0.1, fill = "#edae49") +
geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(2), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(5), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(10), lty = 2, lwd = 0.2) +
labs(x = expression("2"*sigma),
y = "Position Count") +
theme_classic() +
theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none")
idio_d_order_1
#ggsave("fig_1B.svg", idio_d_order_1, width = 180/2, height = 247/4, dpi = 300, units = "mm")
First order positions which have a heterogeneity index > log10 1.5-fold, 2-fold, 5-fold, and 10-fold for adaptive trajectories
idio_d %>%
filter(mutations == "Order 1") %>%
mutate(idiosync_1.5 = idiosync > log10(1.5),
idiosync_2 = idiosync > log10(2),
idiosync_5 = idiosync > log10(5),
idiosync_10 = idiosync > log10(10)) %>%
group_by(mutations) %>%
summarise(idiosync_sig_1.5 = sum(idiosync_1.5),
idiosync_sig_2 = sum(idiosync_2),
idiosync_sig_5 = sum(idiosync_5),
idiosync_sig_10 = sum(idiosync_10),
idiosync_total = length(idiosync),
idiosync_1.5 = sum(idiosync_1.5)/length(idiosync_1.5) * 100,
idiosync_2 = sum(idiosync_2)/length(idiosync_2) * 100,
idiosync_5 = sum(idiosync_5)/length(idiosync_5) * 100,
idiosync_10 = sum(idiosync_10)/length(idiosync_10) * 100
) %>%
knitr::kable()
| mutations | idiosync_sig_1.5 | idiosync_sig_2 | idiosync_sig_5 | idiosync_sig_10 | idiosync_total | idiosync_1.5 | idiosync_2 | idiosync_5 | idiosync_10 |
|---|---|---|---|---|---|---|---|---|---|
| Order 1 | 211 | 205 | 150 | 114 | 214 | 98.59813 | 95.79439 | 70.09346 | 53.27103 |
Histogram to show heterogenity at Orders 2, 3, and 4 with the log10 1.5-fold significance threshold
idio_d_plot_2 = idio_d %>%
filter(mutations == "Order 2") %>%
ggplot(aes(x = idiosync)) +
geom_histogram(fill = "#edae49", color = "black", lwd = 0.1) +
geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(2), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(5), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(10), lty = 2, lwd = 0.2) +
labs(x = expression("log"[10]*" 2"*sigma),
y = "Combination Count") +
theme_classic() +
theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none")
idio_d_plot_2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#ggsave("fig_2A_1.svg", idio_d_plot_2, width = 180/3, height = 247/4, dpi = 300, units = "mm")
idio_d_plot_3 = idio_d %>%
filter(mutations == "Order 3") %>%
ggplot(aes(x = idiosync)) +
geom_histogram(fill = "#edae49", color = "black", lwd = 0.1) +
geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(2), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(5), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(10), lty = 2, lwd = 0.2) +
labs(x = expression("log"[10]*" 2"*sigma),
y = "Combination Count") +
theme_classic() +
theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none")
idio_d_plot_3
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#ggsave("fig_2A_2.svg", idio_d_plot_3, width = 180/3, height = 247/4, dpi = 300, units = "mm")
idio_d_plot_4 = idio_d %>%
filter(mutations == "Order 4") %>%
ggplot(aes(x = idiosync)) +
geom_histogram(fill = "#edae49", color = "black", lwd = 0.1) +
geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(2), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(5), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(10), lty = 2, lwd = 0.2) +
labs(x = expression("log"[10]*" 2"*sigma),
y = "Combination Count") +
theme_classic() +
theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none")
idio_d_plot_4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#ggsave("fig_2A_3.svg", idio_d_plot_4, width = 180/3, height = 247/4, dpi = 300, units = "mm")
idio_d_multi_plot = idio_d %>%
filter(mutations != "Order 1" & mutations != "Order 5" & mutations != "Order 6") %>%
ggplot(aes(x = idiosync)) +
geom_histogram(color = "black", lwd = 0.1, fill = "#edae49") +
geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(2), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(5), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(10), lty = 2, lwd = 0.2) +
labs(x = expression("2"*sigma),
y = "Combination Count") +
facet_grid(~ mutations) +
scale_x_continuous(limits = c(0,5)) +
theme_classic() +
theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none")
idio_d_multi_plot # Data removed
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 51 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 6 rows containing missing values (`geom_bar()`).
#ggsave("fig_2A.svg", idio_d_multi_plot, width = 180, height = 247/4, dpi = 300, units = "mm")
What percentage of positions remain significantly heterogeneous (1.5-fold) at these orders?
idio_d %>%
filter(mutations == "Order 2" | mutations == "Order 3" | mutations == "Order 4") %>%
mutate(idiosync = idiosync > log10(1.5)) %>%
group_by(mutations) %>%
summarise(idiosync_sig = sum(idiosync),
idiosync_total = length(idiosync),
idiosync = sum(idiosync)/length(idiosync) * 100
)
## # A tibble: 3 × 4
## mutations idiosync_sig idiosync_total idiosync
## <chr> <int> <int> <dbl>
## 1 Order 2 413 419 98.6
## 2 Order 3 420 438 95.9
## 3 Order 4 240 245 98.0
What percentage of positions remain significantly heterogeneous (1.5-, 2-, 5-, and 10-fold) at all orders?
idio_d %>%
filter(mutations == "Order 1" | mutations == "Order 2" | mutations == "Order 3" | mutations == "Order 4") %>%
mutate(idiosync_1.5 = idiosync > log10(1.5),
idiosync_2 = idiosync > log10(2),
idiosync_5 = idiosync > log10(5),
idiosync_10 = idiosync > log10(10)) %>%
group_by(mutations) %>%
summarise(idiosync_sig_1.5 = sum(idiosync_1.5),
idiosync_sig_2 = sum(idiosync_2),
idiosync_sig_5 = sum(idiosync_5),
idiosync_sig_10 = sum(idiosync_10),
idiosync_total = length(idiosync),
idiosync_1.5 = sum(idiosync_1.5)/length(idiosync_1.5) * 100,
idiosync_2 = sum(idiosync_2)/length(idiosync_2) * 100,
idiosync_5 = sum(idiosync_5)/length(idiosync_5) * 100,
idiosync_10 = sum(idiosync_10)/length(idiosync_10) * 100
) %>%
knitr::kable()
| mutations | idiosync_sig_1.5 | idiosync_sig_2 | idiosync_sig_5 | idiosync_sig_10 | idiosync_total | idiosync_1.5 | idiosync_2 | idiosync_5 | idiosync_10 |
|---|---|---|---|---|---|---|---|---|---|
| Order 1 | 211 | 205 | 150 | 114 | 214 | 98.59813 | 95.79439 | 70.09346 | 53.27103 |
| Order 2 | 413 | 387 | 253 | 172 | 419 | 98.56802 | 92.36277 | 60.38186 | 41.05012 |
| Order 3 | 420 | 374 | 199 | 136 | 438 | 95.89041 | 85.38813 | 45.43379 | 31.05023 |
| Order 4 | 240 | 204 | 122 | 61 | 245 | 97.95918 | 83.26531 | 49.79592 | 24.89796 |
Collects all \(\Delta\)Function data for all orders
higher_order_list = list.files(path = "Output", pattern="higher_box", recursive = T)
higher_df = data.frame()
for(higher_file in 1:length(higher_order_list)){
Condition = str_split(str_split(higher_order_list[higher_file], "_")[[1]][3], "/")[[1]][1]
Measurement = str_split(higher_order_list[higher_file], "_")[[1]][2]
Enzyme = str_split(higher_order_list[higher_file], "_")[[1]][1]
appending_file = read_csv(paste0("Output/", higher_order_list[higher_file]), show_col_types=F)
appending_file$Condition = Condition
appending_file$Measurement = Measurement
appending_file$Enzyme = Enzyme
higher_df = bind_rows(higher_df, appending_file)
}
higher_df = higher_df %>%
unite("unique_id", c(pos, Condition, Measurement, Enzyme), remove = F) %>%
unite("partial_id", c(Enzyme, Measurement, Condition), remove = F)
higher_all_sign_check = higher_df %>%
filter(str_count(genotype, "x")/str_length(genotype) != 1) %>% # Remove interactions with no other backgrounds
mutate(mutations = factor(paste("Order", mutations, sep = " "))) %>%
group_by(unique_id, mutations) %>%
summarise(min_effect = min(avg),
max_effect = max(avg))
Assigning positive, neutral, negative, negative-neutral, positive-neutral, and negative-positive ‘types’ based on log10 1.5-fold threshold to every Order
threshold = 1.5
types = c()
for(each in 1:dim(higher_all_sign_check)[1]){
if(higher_all_sign_check$min_effect[each] < log10(1/threshold)){
# Negative branch
if(higher_all_sign_check$max_effect[each] < log10(1/threshold)) {
# Full negative
types = c(types, "Negative")
} else if(higher_all_sign_check$max_effect[each] <= log10(threshold)) {
# Negative Neutral
types = c(types, "Neutral Negative")
} else {
# Negative Positive
types = c(types, "Positive Negative")
}
} else {
# Neutral or Positive
if(higher_all_sign_check$min_effect[each] > log10(threshold)) {
# Full positive
types = c(types, "Positive")
} else if(higher_all_sign_check$max_effect[each] > log10(threshold)) {
# Neutral Positive
types = c(types, "Neutral Positive")
} else {
# Neutral
types = c(types, "Neutral")
}
}
}
higher_all_sign_check$type = factor(types)
All outputs show position/combination numbers in each category, followed by the total sum of all probed positions/combinations at that order, and finally percentages of each category
order_checker = higher_all_sign_check %>%
filter(mutations == "Order 1") %>%
pull(type)
order_checker_levels = c("Negative", "Neutral", "Positive", "Neutral Negative", "Neutral Positive", "Positive Negative")
summary(order_checker)
## Negative Neutral Neutral Negative Neutral Positive
## 4 2 17 35
## Positive Positive Negative
## 21 135
sum(summary(order_checker))
## [1] 214
summary(order_checker) / length(order_checker) * 100
## Negative Neutral Neutral Negative Neutral Positive
## 1.8691589 0.9345794 7.9439252 16.3551402
## Positive Positive Negative
## 9.8130841 63.0841121
pie_df = tibble(names = factor(names(summary(order_checker)), levels = order_checker_levels), values = summary(order_checker))
pie_order_1 = ggplot(pie_df, aes(x="", y=values, fill=names))+
geom_bar(width = 1, stat = "identity") +
coord_polar("y") +
theme_minimal() +
theme(axis.title.x = element_blank(), axis.title.y = element_blank(), panel.border = element_blank(), panel.grid=element_blank(), axis.ticks = element_blank(), plot.title=element_text(size=14, face="bold"))
pie_order_1
#ggsave("fig_1C.svg", pie_order_1, width = 180/2, height = 247/4, dpi = 300, units = "mm")
order_checker = higher_all_sign_check %>%
filter(mutations == "Order 2") %>%
pull(type)
summary(order_checker)
## Negative Neutral Neutral Negative Neutral Positive
## 5 5 37 63
## Positive Positive Negative
## 13 296
sum(summary(order_checker))
## [1] 419
summary(order_checker) / length(order_checker) * 100
## Negative Neutral Neutral Negative Neutral Positive
## 1.193317 1.193317 8.830549 15.035800
## Positive Positive Negative
## 3.102625 70.644391
pie_df = tibble(names = factor(names(summary(order_checker)), levels = order_checker_levels), values = summary(order_checker))
pie_order_2 = ggplot(pie_df, aes(x="", y=values, fill=names))+
geom_bar(width = 1, stat = "identity") +
coord_polar("y") +
theme_minimal() +
theme(axis.title.x = element_blank(), axis.title.y = element_blank(), panel.border = element_blank(), panel.grid=element_blank(), axis.ticks = element_blank(), plot.title=element_text(size=14, face="bold"))
pie_order_2
#ggsave("fig_2B_1.svg", pie_order_2, width = 180/2, height = 247/4, dpi = 300, units = "mm")
order_checker = higher_all_sign_check %>%
filter(mutations == "Order 3") %>%
pull(type)
summary(order_checker)
## Negative Neutral Neutral Negative Neutral Positive
## 15 10 63 59
## Positive Positive Negative
## 51 240
sum(summary(order_checker))
## [1] 438
summary(order_checker) / length(order_checker) * 100
## Negative Neutral Neutral Negative Neutral Positive
## 3.424658 2.283105 14.383562 13.470320
## Positive Positive Negative
## 11.643836 54.794521
pie_df = tibble(names = factor(names(summary(order_checker)), levels = order_checker_levels), values = summary(order_checker))
pie_order_3 = ggplot(pie_df, aes(x="", y=values, fill=names))+
geom_bar(width = 1, stat = "identity") +
coord_polar("y") +
theme_minimal() +
theme(axis.title.x = element_blank(), axis.title.y = element_blank(), panel.border = element_blank(), panel.grid=element_blank(), axis.ticks = element_blank(), plot.title=element_text(size=14, face="bold"))
pie_order_3
#ggsave("fig_2B_2.svg", pie_order_3, width = 180/2, height = 247/4, dpi = 300, units = "mm")
order_checker = higher_all_sign_check %>%
filter(mutations == "Order 4") %>%
pull(type)
summary(order_checker)
## Negative Neutral Neutral Negative Neutral Positive
## 11 5 50 42
## Positive Positive Negative
## 11 126
sum(summary(order_checker))
## [1] 245
summary(order_checker) / length(order_checker) * 100
## Negative Neutral Neutral Negative Neutral Positive
## 4.489796 2.040816 20.408163 17.142857
## Positive Positive Negative
## 4.489796 51.428571
pie_df = tibble(names = factor(names(summary(order_checker)), levels = order_checker_levels), values = summary(order_checker))
pie_order_4 = ggplot(pie_df, aes(x="", y=values, fill=names))+
geom_bar(width = 1, stat = "identity") +
coord_polar("y") +
theme_minimal() +
theme(axis.title.x = element_blank(), axis.title.y = element_blank(), panel.border = element_blank(), panel.grid=element_blank(), axis.ticks = element_blank(), plot.title=element_text(size=14, face="bold"))
pie_order_4
#ggsave("fig_2B_3.svg", pie_order_4, width = 180/2, height = 247/4, dpi = 300, units = "mm")
Raw Values
higher_all_sign_check_table = higher_all_sign_check %>%
group_by(mutations) %>%
count(type) %>%
pivot_wider(names_from = type,
values_from = n) %>%
mutate(Total = sum(across("Negative":"Positive Negative")))
higher_all_sign_check_table %>% knitr::kable()
| mutations | Negative | Neutral | Neutral Negative | Neutral Positive | Positive | Positive Negative | Total |
|---|---|---|---|---|---|---|---|
| Order 1 | 4 | 2 | 17 | 35 | 21 | 135 | 214 |
| Order 2 | 5 | 5 | 37 | 63 | 13 | 296 | 419 |
| Order 3 | 15 | 10 | 63 | 59 | 51 | 240 | 438 |
| Order 4 | 11 | 5 | 50 | 42 | 11 | 126 | 245 |
| Order 5 | 5 | 4 | 12 | 18 | 6 | 39 | 84 |
| Order 6 | 1 | 2 | 2 | 2 | 1 | 6 | 14 |
Percentages
higher_all_sign_check_table %>%
mutate(Negative = Negative/Total*100,
Neutral = Neutral/Total*100,
`Neutral Negative` = `Neutral Negative`/Total*100,
`Neutral Positive` = `Neutral Positive`/Total*100,
Positive = Positive/Total*100,
`Positive Negative` = `Positive Negative`/Total*100) %>%
knitr::kable()
| mutations | Negative | Neutral | Neutral Negative | Neutral Positive | Positive | Positive Negative | Total |
|---|---|---|---|---|---|---|---|
| Order 1 | 1.869159 | 0.9345794 | 7.943925 | 16.35514 | 9.813084 | 63.08411 | 214 |
| Order 2 | 1.193317 | 1.1933174 | 8.830549 | 15.03580 | 3.102625 | 70.64439 | 419 |
| Order 3 | 3.424657 | 2.2831050 | 14.383562 | 13.47032 | 11.643836 | 54.79452 | 438 |
| Order 4 | 4.489796 | 2.0408163 | 20.408163 | 17.14286 | 4.489796 | 51.42857 | 245 |
| Order 5 | 5.952381 | 4.7619048 | 14.285714 | 21.42857 | 7.142857 | 46.42857 | 84 |
| Order 6 | 7.142857 | 14.2857143 | 14.285714 | 14.28571 | 7.142857 | 42.85714 | 14 |
Purpose of this analysis was to determine the difference in \(\Delta\)Function between the WT-background contribution of a given position/combination and the mean contribution of a given position or combination
The spread of all positional contribution vs mean differences for each Order (1-4)
higher_df_mean = higher_df %>%
filter(mutations < 5) %>%
filter(str_count(genotype, "x")/str_length(genotype) != 1) %>% # Remove interactions with no other backgrounds
group_by(unique_id, partial_id) %>%
summarise(avg = mean(avg))
devs = c()
sign_devs = c()
cur_mutations = c()
for(i in 1:dim(higher_df_mean)[1]) {
curr_id = higher_df_mean$unique_id[i]
devs = c(devs, abs( (higher_df %>% filter(!str_detect(genotype, "1")) %>% filter(unique_id == curr_id) %>% pull(avg)) - (higher_df_mean$avg[i]) ))
cur_mutations = c(cur_mutations, (higher_df %>% filter(!str_detect(genotype, "1")) %>% filter(unique_id == curr_id) %>% pull(mutations)))
sign_devs = c(sign_devs, ifelse( (((higher_df %>% filter(!str_detect(genotype, "1")) %>% filter(unique_id == curr_id) %>% pull(avg)) > log10(1.5) & (higher_df_mean$avg[i]) < log10(1/1.5)) | ((higher_df %>% filter(!str_detect(genotype, "1")) %>% filter(unique_id == curr_id) %>% pull(avg)) < log10(1/1.5) & (higher_df_mean$avg[i]) > log10(1.5))), T, F)) ## Check if WT is positive while average is negative or vice versa
}
devs_df_wt = data.frame(devs = devs, muts = cur_mutations, sign_devs = sign_devs)
## Add unique ID if the WT deviates from mean
devs_df_wt = devs_df_wt %>%
mutate(unique_id = higher_df_mean$unique_id[row_number()]) %>%
mutate(partial_id = higher_df_mean$partial_id[row_number()])
How many WT points have sign deviation from the mean at the 1st order?
paste0(length(which(devs_df_wt %>% filter(muts == 1) %>% pull(sign_devs))), "/", length(devs_df_wt %>% filter(muts == 1) %>% pull(sign_devs)))
## [1] "7/214"
paste0(round(sum(devs_df_wt %>% filter(muts == 1) %>% pull(sign_devs)) / length(devs_df_wt %>% filter(muts == 1) %>% pull(sign_devs)) * 100, 2), "%")
## [1] "3.27%"
How many WT points have sign deviation from the mean at the 2nd order?
paste0(length(which(devs_df_wt %>% filter(muts == 2) %>% pull(sign_devs))), "/", length(devs_df_wt %>% filter(muts == 2) %>% pull(sign_devs)))
## [1] "53/419"
paste0(round(sum(devs_df_wt %>% filter(muts == 2) %>% pull(sign_devs)) / length(devs_df_wt %>% filter(muts == 2) %>% pull(sign_devs)) * 100, 2), "%")
## [1] "12.65%"
How many WT points have sign deviation from the mean at the 3rd order?
paste0(length(which(devs_df_wt %>% filter(muts == 3) %>% pull(sign_devs))), "/", length(devs_df_wt %>% filter(muts == 3) %>% pull(sign_devs)))
## [1] "34/438"
paste0(round(sum(devs_df_wt %>% filter(muts == 3) %>% pull(sign_devs)) / length(devs_df_wt %>% filter(muts == 3) %>% pull(sign_devs)) * 100, 2), "%")
## [1] "7.76%"
How many WT points have sign deviation from the mean at the 4th order?
paste0(length(which(devs_df_wt %>% filter(muts == 4) %>% pull(sign_devs))), "/", length(devs_df_wt %>% filter(muts == 4) %>% pull(sign_devs)))
## [1] "12/245"
paste0(round(sum(devs_df_wt %>% filter(muts == 4) %>% pull(sign_devs)) / length(devs_df_wt %>% filter(muts == 4) %>% pull(sign_devs)) * 100, 2), "%")
## [1] "4.9%"
Next we move away from sign to normal distance heterogenity
devs_df_wt_1_plot = devs_df_wt %>%
filter(muts == 1) %>%
ggplot(aes(x = devs)) +
geom_histogram(binwidth=0.08, color = "black", lwd = 0.1, fill = "#edae49") +
geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(2), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(5), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(10), lty = 2, lwd = 0.2) +
labs(x = expression("Deviation from Positional Mean ("*Delta*italic("F")*")"),
y = "Position Count") +
theme_classic() +
theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none")
devs_df_wt_1_plot
#ggsave("fig_1D.svg", devs_df_wt_1_plot, width = 180/2, height = 247/4, dpi = 300, units = "mm")
devs_df_wt_2 = devs_df_wt %>%
filter(muts == 2) %>%
ggplot(aes(x = devs)) +
geom_histogram(binwidth=0.08, color = "black", lwd = 0.1, fill = "#edae49") +
geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(2), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(5), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(10), lty = 2, lwd = 0.2) +
theme_classic() +
#scale_x_continuous(limits = c(0, 3)) +
labs(x = expression("Deviation from Combinatorial Mean (log"[10]*" "*epsilon*")"),
y = "Combination Count") +
theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none")
devs_df_wt_2
#ggsave("fig_sup_4B_1.svg", devs_df_wt_2, width = 180/3, height = 247/4, dpi = 300, units = "mm")
devs_df_wt_3 = devs_df_wt %>%
filter(muts == 3) %>%
ggplot(aes(x = devs)) +
geom_histogram(binwidth=0.08, color = "black", lwd = 0.1, fill = "#edae49") +
geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(2), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(5), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(10), lty = 2, lwd = 0.2) +
#scale_x_continuous(limits = c(0, 3)) +
theme_classic() +
labs(x = expression("Deviation from Combinatorial Mean (log"[10]*" "*epsilon*")"),
y = "Combination Count") +
theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none")
devs_df_wt_3
#ggsave("fig_sup_4B_2.svg", devs_df_wt_3, width = 180/3, height = 247/4, dpi = 300, units = "mm")
devs_df_wt_4 = devs_df_wt %>%
filter(muts == 4) %>%
ggplot(aes(x = devs)) +
geom_histogram(binwidth=0.08, color = "black", lwd = 0.1, fill = "#edae49") +
geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(2), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(5), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(10), lty = 2, lwd = 0.2) +
theme_classic() +
#scale_x_continuous(limits = c(0, 3)) + # Same scale used later
labs(x = expression("Deviation from Combinatorial Mean (log"[10]*" "*epsilon*")"),
y = "Combination Count") +
theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none")
devs_df_wt_4
#ggsave("fig_sup_4B_3.svg", devs_df_wt_4, width = 180/3, height = 247/4, dpi = 300, units = "mm")
devs_df_wt_multi = devs_df_wt %>%
filter(muts == 2 | muts == 3 | muts == 4) %>%
ggplot(aes(x = devs)) +
geom_histogram(binwidth=0.1, color = "black", lwd = 0.1, fill = "#edae49") +
geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(2), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(5), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(10), lty = 2, lwd = 0.2) +
facet_grid(~ muts) +
scale_x_continuous(limits = c(0, 3)) +
theme_classic() +
labs(x = expression("Deviation from Combinatorial Mean ("*epsilon*")"),
y = "Combination Count") +
theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none")
devs_df_wt_multi
## Warning: Removed 8 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 6 rows containing missing values (`geom_bar()`).
#ggsave("fig_2C.svg", devs_df_wt_multi, width = 180, height = 247/4, dpi = 300, units = "mm")
Table to show percentages of WT-bg points that deviate from the positional/combinatorial mean by significance threshold (log10 1.5-fold)
devs_df_wt %>%
mutate(muts = factor(muts)) %>%
group_by(muts) %>%
summarise(percent = sum(devs > log10(1.5)) / length(devs),
outlier = sum(devs > log10(1.5)),
total = length(devs)) %>%
knitr::kable()
| muts | percent | outlier | total |
|---|---|---|---|
| 1 | 0.6775701 | 145 | 214 |
| 2 | 0.6443914 | 270 | 419 |
| 3 | 0.5593607 | 245 | 438 |
| 4 | 0.6244898 | 153 | 245 |
Table to show percentages of WT-bg points that deviate from the positional/combinatorial mean by multiple significance threshold (log10 1.5-, 2-, 5-, and 10-fold)
devs_df_wt %>%
mutate(Order = factor(muts)) %>%
group_by(Order) %>%
summarise(percent_1.5 = round(sum(devs > log10(1.5)) / length(devs) *100, 1),
outlier_1.5 = sum(devs > log10(1.5)),
percent_2 = round(sum(devs > log10(2)) / length(devs) *100, 1),
outlier_2 = sum(devs > log10(2)),
percent_5 = round(sum(devs > log10(5)) / length(devs) *100, 1),
outlier_5 = sum(devs > log10(5)),
percent_10 = round(sum(devs > log10(10)) / length(devs) *100, 1),
outlier_10 = sum(devs > log10(10)),
total = length(devs)) %>%
knitr::kable()
| Order | percent_1.5 | outlier_1.5 | percent_2 | outlier_2 | percent_5 | outlier_5 | percent_10 | outlier_10 | total |
|---|---|---|---|---|---|---|---|---|---|
| 1 | 67.8 | 145 | 51.4 | 110 | 19.2 | 41 | 9.3 | 20 | 214 |
| 2 | 64.4 | 270 | 51.3 | 215 | 21.5 | 90 | 9.8 | 41 | 419 |
| 3 | 55.9 | 245 | 35.2 | 154 | 19.2 | 84 | 13.9 | 61 | 438 |
| 4 | 62.4 | 153 | 33.9 | 83 | 9.0 | 22 | 6.9 | 17 | 245 |
Purpose of this analysis was to determine the difference in \(\Delta\)Function between functional contribution of a given position/combination in any background and the mean contribution of a given position or combination
The spread of all positional contribution vs mean differences for each Order (1-4)
devs = c()
cur_mutations = c()
cur_partial_id = c()
for(i in 1:dim(higher_df_mean)[1]) {
curr_id = higher_df_mean$unique_id[i]
devs = c(devs, abs( (higher_df %>% filter(unique_id == curr_id) %>% pull(avg)) - (higher_df_mean$avg[i] ) ))
cur_mutations = c(cur_mutations, (higher_df %>% filter(unique_id == curr_id) %>% pull(mutations)))
cur_partial_id = c(cur_partial_id, (higher_df %>% filter(unique_id == curr_id) %>% pull(partial_id)))
}
devs_df = data.frame(devs = devs, muts = cur_mutations, partial_id = cur_partial_id)
devs_df %>%
filter(muts == 1) %>%
ggplot(aes(x = devs)) +
geom_histogram(binwidth=0.08, color = "black", lwd=0.1, fill = "#edae49") +
geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(2), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(5), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(10), lty = 2, lwd = 0.2) +
theme_classic() +
labs(x = expression("Deviation from Positional Mean ("*Delta*italic("F")*")"),
y = "Genotype Count") +
theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none")
devs_df %>%
filter(muts == 2) %>%
ggplot(aes(x = devs)) +
geom_histogram(binwidth=0.08, color = "black", lwd=0.1, fill = "#edae49") +
geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(2), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(5), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(10), lty = 2, lwd = 0.2) +
theme_classic() +
labs(x = expression("Deviation from Positional Mean ("*Delta*italic("F")*")"),
y = "Genotype Count") +
theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none")
devs_df %>%
filter(muts == 3) %>%
ggplot(aes(x = devs)) +
geom_histogram(binwidth=0.08, color = "black", lwd=0.1, fill = "#edae49") +
geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(2), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(5), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(10), lty = 2, lwd = 0.2) +
theme_classic() +
labs(x = expression("Deviation from Positional Mean ("*Delta*italic("F")*")"),
y = "Genotype Count") +
theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none")
devs_df %>%
filter(muts == 4) %>%
ggplot(aes(x = devs)) +
geom_histogram(binwidth=0.08, color = "black", lwd=0.1, fill = "#edae49") +
geom_vline(xintercept = log10(1.5), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(2), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(5), lty = 2, lwd = 0.2) +
geom_vline(xintercept = log10(10), lty = 2, lwd = 0.2) +
theme_classic() +
labs(x = expression("Deviation from Positional Mean ("*Delta*italic("F")*")"),
y = "Genotype Count") +
theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none")
Table to show percentages of WT-bg points that deviate from the positional/combinatorial mean by significance threshold (log10 1.5-fold)
devs_df %>%
mutate(Order = factor(muts)) %>%
group_by(Order) %>%
summarise(percent = sum(devs > log10(1.5)) / length(devs),
outlier = sum(devs > log10(1.5)),
total = length(devs)) %>%
knitr::kable()
| Order | percent | outlier | total |
|---|---|---|---|
| 1 | 0.5888287 | 2393 | 4064 |
| 2 | 0.5286815 | 2470 | 4672 |
| 3 | 0.4963038 | 1477 | 2976 |
| 4 | 0.5598214 | 627 | 1120 |
Table to show percentages of WT-bg points that deviate from the positional/combinatorial mean by many significance thresholds (log10 1.5-, 2-, 5-, and 10-fold)
devs_df %>%
mutate(Order = factor(muts)) %>%
group_by(Order) %>%
summarise(percent_1.5 = sum(devs > log10(1.5)) / length(devs),
outlier_1.5 = sum(devs > log10(1.5)),
percent_2 = sum(devs > log10(2)) / length(devs),
outlier_2 = sum(devs > log10(2)),
percent_5 = sum(devs > log10(5)) / length(devs),
outlier_5 = sum(devs > log10(5)),
percent_10 = sum(devs > log10(10)) / length(devs),
outlier_10 = sum(devs > log10(10)),
total = length(devs)) %>%
knitr::kable()
| Order | percent_1.5 | outlier_1.5 | percent_2 | outlier_2 | percent_5 | outlier_5 | percent_10 | outlier_10 | total |
|---|---|---|---|---|---|---|---|---|---|
| 1 | 0.5888287 | 2393 | 0.4040354 | 1642 | 0.1483760 | 603 | 0.0809547 | 329 | 4064 |
| 2 | 0.5286815 | 2470 | 0.3488870 | 1630 | 0.1316353 | 615 | 0.0747003 | 349 | 4672 |
| 3 | 0.4963038 | 1477 | 0.3114919 | 927 | 0.1202957 | 358 | 0.0796371 | 237 | 2976 |
| 4 | 0.5598214 | 627 | 0.3455357 | 387 | 0.0991071 | 111 | 0.0589286 | 66 | 1120 |
This sections attempts to probe if the previous values from the statistical analysis are representative in the individual data sets
d_individual = d %>% filter_all(any_vars(!is.na(.))) # removes rows with all NA before making unique ID
d_individual = d_individual %>%
unite("landscape_id", c(enzyme, type, cond), remove = F)
landscape_ids = unique(d_individual$landscape_id)
general_df = tibble()
for(i in 1:length(landscape_ids)) {
d_curr_individual = d_individual %>% filter(landscape_id == landscape_ids[i])
general = c(unique(d_curr_individual$enzyme),
landscape_ids[i],
sum(d_curr_individual$effects < log10(1/1.5)) / length(d_curr_individual$effects),
sum(d_curr_individual$effects <= log10(1.5) & d_curr_individual$effects >= log10(1/1.5)) / length(d_curr_individual$effects),
sum(d_curr_individual$effects > log10(1.5)) / length(d_curr_individual$effects)
)
general_df = rbind(general_df, general)
}
colnames(general_df) = c("Enzyme", "Landscape ID", "Negative", "Neutral", "Positive")
general_df[-1]
## Landscape ID Negative Neutral Positive
## 1 AP_catef_1 0.8375 0.025 0.1375
## 2 DHFR_ic50_c57 0.375 0.09375 0.53125
## 3 DHFR_ic50_c58 0.375 0.09375 0.53125
## 4 DHFR_ic50_c59 0.46875 0.15625 0.375
## 5 DHFR_ic50_c60 0.375 0.21875 0.40625
## 6 DHFR_ic50_c61 0.125 0.3125 0.5625
## 7 DHFR_ic75_palmer 0.223958333333333 0.25 0.526041666666667
## 8 DHFR_kcat_trajg 0.6625 0.2 0.1375
## 9 DHFR_kcat_trajr 0.5875 0.2875 0.125
## 10 DHFR_ki_trajg 0 0 1
## 11 DHFR_ki_trajr 0 0 1
## 12 MPH_catact_CaPTM 0.15 0.1875 0.6625
## 13 MPH_catact_CdPTM 0.1375 0.3375 0.525
## 14 MPH_catact_CoPTM 0.2 0.0875 0.7125
## 15 MPH_catact_CuPTM 0.225 0.1625 0.6125
## 16 MPH_catact_MgPTM 0.2125 0.25 0.5375
## 17 MPH_catact_MnPTM 0.25 0.1875 0.5625
## 18 MPH_catact_NiPTM 0.2 0.1625 0.6375
## 19 MPH_catact_ZnPTM 0.2 0.175 0.625
## 20 NfsA_ec50_2039 0.09375 0.627232142857143 0.279017857142857
## 21 NfsA_ec50_3637 0.0982142857142857 0.529017857142857 0.372767857142857
## 22 OXA-48_ic50_CAZtraj1 0 0.4375 0.5625
## 23 OXA-48_ic50_CAZtraj2 0.0729166666666667 0.333333333333333 0.59375
## 24 OXA-48_ic50_CAZtraj3 0.03125 0.677083333333333 0.291666666666667
## 25 OXA-48_ic50_PIPtraj1 0.375 0.375 0.25
## 26 OXA-48_ic50_PIPtraj2 0.369791666666667 0.473958333333333 0.15625
## 27 OXA-48_ic50_PIPtraj3 0.3125 0.583333333333333 0.104166666666667
## 28 PTE_catact_2NH 0.0885416666666667 0.239583333333333 0.671875
## 29 PTE_catact_butyrate 0.239583333333333 0.317708333333333 0.442708333333333
## 30 TEM_growth_AM 0.3125 0.3125 0.375
## 31 TEM_growth_AMC 0.28125 0.40625 0.3125
## 32 TEM_growth_AMP 0.3125 0.40625 0.28125
## 33 TEM_growth_CAZ 0.28125 0.25 0.46875
## 34 TEM_growth_CEC 0.40625 0.28125 0.3125
## 35 TEM_growth_CPD 0.25 0.1875 0.5625
## 36 TEM_growth_CPR 0.4375 0.40625 0.15625
## 37 TEM_growth_CRO 0.375 0.09375 0.53125
## 38 TEM_growth_CTT 0.4375 0.0625 0.5
## 39 TEM_growth_CTX 0.1875 0.46875 0.34375
## 40 TEM_growth_CXM 0.28125 0.21875 0.5
## 41 TEM_growth_FEP 0.21875 0.25 0.53125
## 42 TEM_growth_SAM 0.125 0.3125 0.5625
## 43 TEM_growth_TZP 0.5625 0.21875 0.21875
## 44 TEM_growth_ZOX 0.25 0.3125 0.4375
## 45 TEM_MIC_weinreich 0.0375 0.3 0.6625
The trend appears to be heterogenous, i.e. the fraction of negative, neutral, and positive is varied across all landscapes but averages out to roughly equal proportions when all landscapes are considered together.
For example, AP shows very negative contribution at 77.5% of all mutations, DHFR kcats are predominantly negative while Kis are almost all positive (due to the adaptive nature of the mutations). MPH shows strong positives as it is also adaptive. NfsA is predominantly neutral, with some positive contribution. OXA is positive and neutral for adaptive trajectories, and negative and neutral for trade-off. PTE is positive for its adaptive substrate, but shows more negative behavior for lactones and OPs. TEM across all antibiotics is predominantly neutral (no directy adaptation). Finally the adaptive MIC of TEM is mostly positive with 66.3%.
general_df %>%
pivot_longer(c(3,4,5)) %>%
mutate(value = as.numeric(value)*100) %>%
ggplot(aes(x = name, y = value, color = Enzyme)) +
geom_point() +
stat_summary(fun=mean, colour="darkred", geom="crossbar", width=0.2) +
theme_classic()
Import the absolute error files for each data set
all_mae = lapply(paste0("Output/", list.files(path = "Output", pattern="aic_*", recursive = T)), read_csv, skip = 1, show_col_types=F, col_names = c("MAE", "Last MAE", "Step", "Enzyme", "Linear MAE", "Linear Last MAE"))
for(element in 1:length(all_mae)){
all_mae[[element]]$ID = str_split(list.files(path = "Output", pattern="aic_*", recursive = T)[element], "/")[[1]][1]
}
all_mae = do.call("rbind", all_mae)
For some reason the absolute error files for OXA-48 CAZ and PIP Traj 1 are including the step 4 model, which uses all 4 of the coefficients in the model for prediction, which is results in a perfect prediction for the WT-background model. I can just remove it manually here
all_mae = all_mae %>% filter(!((ID == "OXA-48_ic50_CAZtraj1" | ID == "OXA-48_ic50_PIPtraj1") & Step == 4))
Some statistics for the AE for each model. Now F is converted to fold-change in function (but remains WT-normalized)
all_mae %>%
group_by(Step) %>%
summarise(wtbg_mean = 10^mean(`Last MAE`),
wtbg_median = 10^median(`Last MAE`),
lin_mean = 10^mean(`Linear Last MAE`),
lin_median = 10^median(`Linear Last MAE`)) %>%
knitr::kable()
| Step | wtbg_mean | wtbg_median | lin_mean | lin_median |
|---|---|---|---|---|
| 1 | 19.507536 | 8.008615 | 2.889882 | 2.302982 |
| 2 | 111.737428 | 63.740988 | 2.016922 | 1.765125 |
| 3 | 51.067872 | 8.952979 | 1.273370 | 1.192176 |
| 4 | 28.215338 | 4.499601 | 1.105085 | 1.066184 |
| 5 | 7.322787 | 1.313971 | 1.094891 | 1.080794 |
| 6 | 1.730950 | 1.000000 | 1.114261 | 1.106314 |
| 7 | 1.000000 | 1.000000 | 1.087920 | 1.087920 |
Are the means significantly less than 1.5-fold?
all_mae %>%
group_by(Step) %>%
summarise(wtbg_pval = t.test(`Last MAE`, mu = log10(1.5), alternative = "less")$p.value,
lin_mean = t.test(`Linear Last MAE`, mu = log10(1.5), alternative = "less")$p.value) %>%
filter(Step < 5)
## # A tibble: 4 × 3
## Step wtbg_pval lin_mean
## <dbl> <dbl> <dbl>
## 1 1 1.00 1.00e+ 0
## 2 2 1.00 9.99e- 1
## 3 3 1.00 1.33e- 6
## 4 4 0.994 2.43e-12
Only the linear model gets there for the 3rd order onwards.
After loading and assigning ID corresponding to each landscape value (should be 45) we can plot linear and WT-model MAE’s with their constituents
all_mae_long = all_mae %>%
pivot_longer(cols = c('Last MAE', 'Linear Last MAE'), names_to = "Model", values_to = "AE") %>%
filter(Step < 5)
mae_ggplot = all_mae_long %>%
ggplot(aes(x = Step, y = AE, color = Model)) +
geom_point(size = 1, position=position_jitterdodge(jitter.width = 0.1)) +
stat_summary(data = all_mae_long %>% filter(Model == "Last MAE"), fun=mean, colour="black", geom="crossbar", width=0.2, position = position_nudge(x = -0.19, y = 0), size = 0.3) +
stat_summary(data = all_mae_long %>% filter(Model == "Linear Last MAE"), fun=mean, colour="black", geom="crossbar", width=0.2, position = position_nudge(x = 0.19, y = 0),size = 0.3) +
stat_summary(data = all_mae_long %>% filter(Model == "Last MAE"), fun=median, colour="darkred", geom="crossbar", width=0.2, position = position_nudge(x = -0.19, y = 0), size = 0.3) +
stat_summary(data = all_mae_long %>% filter(Model == "Linear Last MAE"), fun=median, colour="darkred", geom="crossbar", width=0.2, position = position_nudge(x = 0.19, y = 0),size = 0.3) +
geom_hline(yintercept = log10(1.5), lty = 2, size = 0.2) +
scale_color_manual(values = c("#edae49", "#4988ed")) +
labs(x = "Order of the Model",
y = expression("Absolute Error of Predicted"*italic(" F"))) +
theme_classic() +
theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"), legend.position = "none") +
coord_cartesian(ylim=c(0, 6)) # Zoom without affecting means and medians
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
mae_ggplot
#ggsave("fig_3B.svg", mae_ggplot, width = 180/2, height = 247/4, dpi = 300, units = "mm")
This plot omits some data from being viewed, however plotted means and medians are accurate. The figure is edited outside of the code for technical
What is the distribution of predicted vs non-predicted at each order, not just the means?
all_mae_long %>%
group_by(Step, Model) %>%
summarise(pred= length(AE[AE < log10(1.5)]),
non_pred= length(AE[AE >= log10(1.5)])) %>%
mutate(pred_ratio = round(pred/(pred+non_pred)*100,2)) %>%
knitr::kable()
## `summarise()` has grouped output by 'Step'. You can override using the
## `.groups` argument.
| Step | Model | pred | non_pred | pred_ratio |
|---|---|---|---|---|
| 1 | Last MAE | 6 | 39 | 13.33 |
| 1 | Linear Last MAE | 13 | 32 | 28.89 |
| 2 | Last MAE | 5 | 40 | 11.11 |
| 2 | Linear Last MAE | 15 | 30 | 33.33 |
| 3 | Last MAE | 5 | 40 | 11.11 |
| 3 | Linear Last MAE | 34 | 11 | 75.56 |
| 4 | Last MAE | 2 | 21 | 8.70 |
| 4 | Linear Last MAE | 22 | 1 | 95.65 |
First I defined the adaptive trajectories. These are hardcoded based on the dataset I am working with.
adaptive_trajectories = c("DHFR_ki_trajg",
"DHFR_ki_trajr",
"MPH_catact_ZnPTM",
"NfsA_ec50_2039",
"NfsA_ec50_3637",
"OXA-48_ic50_CAZtraj1",
"OXA-48_ic50_CAZtraj2",
"OXA-48_ic50_CAZtraj3",
"PTE_catact_2NH",
"TEM_MIC_weinreich",
"DHFR_ic75_palmer")
Then I need to create a global trajectory data frame
all_traj = do.call("rbind", lapply(paste0("Output/", list.files(path = "Output", pattern="traj_epi_*", recursive = T)), read_csv, skip = 1, show_col_types=F, col_names = c("genotype", "avg", "pos", "mutations", "likely", "enzyme_name", "measure_type", "cond")))
all_traj = all_traj %>% unite("unique_id", c(enzyme_name, measure_type, cond), remove = F)
Next I need to filter landscapes by whether they are adaptive or not
adaptive_traj = all_traj %>% filter(unique_id %in% adaptive_trajectories)
Which genotypes in the adaptive landscapes for the most accessible
trajectory, as defined by the boolean variable likely, are
POSITIVELY epistatic and epistatic in general?
Note: some trajectories don’t go to the last variant, so we explore epistasis of all other genotypes which may go beyond last variant
adaptive_traj_favored = adaptive_traj %>% filter(mutations > 1 & likely) %>% pull(avg)
sum(adaptive_traj_favored > log10(1.5)) / length(adaptive_traj_favored)
## [1] 0.3809524
sum(adaptive_traj_favored > log10(1.5) | adaptive_traj_favored < log10(1/1.5)) / length(adaptive_traj_favored)
## [1] 0.6428571
How many positive epistatic vs all epistatic genotypes vs all genotypes encountered in these landscapes?
sum(adaptive_traj_favored > log10(1.5))
## [1] 16
sum(adaptive_traj_favored > log10(1.5) | adaptive_traj_favored < log10(1/1.5))
## [1] 27
length(adaptive_traj_favored)
## [1] 42
Which genotypes in the non-adaptive landscapes for the favored trajectory are NEGATIVELY epistatic and epistatic in general? Also total genotypes that are not most accessible, with percentages of negative epistasis and epistasis in general.
`%notin%` <- Negate(`%in%`)
non_adaptive_traj_favored = all_traj %>% filter(unique_id %notin% adaptive_trajectories & mutations > 1 & !likely) %>% pull(avg)
sum(non_adaptive_traj_favored < log10(1/1.5))
## [1] 204
sum(non_adaptive_traj_favored > log10(1.5) | non_adaptive_traj_favored < log10(1/1.5))
## [1] 408
length(non_adaptive_traj_favored)
## [1] 557
sum(non_adaptive_traj_favored < log10(1/1.5)) / length(non_adaptive_traj_favored)
## [1] 0.3662478
sum(non_adaptive_traj_favored > log10(1.5) | non_adaptive_traj_favored < log10(1/1.5)) / length(non_adaptive_traj_favored)
## [1] 0.7324955
We construct thee plotting_d tibble which has
information about the observed effect, linear model effect which is
simply ccalled epistatic effect and the effect of each
order of the WT-background model
adaptive_traj_fave = adaptive_traj %>% filter(likely)
###
plotting_d = tibble()
dummy2 = data.frame()
for(i in 1:length(adaptive_trajectories)) {
suffix = tail(str_split(adaptive_trajectories[i], "_")[[1]], 1)
file = paste0("pred_df_", suffix, "_", adaptive_trajectories[i], ".csv")
d = read_csv(paste0("Output/", adaptive_trajectories[i], "/", file), show_col_types=F)
genos = str_replace_all(adaptive_traj_fave[which(adaptive_traj_fave$unique_id %in% adaptive_trajectories[i]),]$genotype, "x", "1")
genos = c(str_replace_all(genos[1], "1", "0"), genos)
dummy2 = rbind(dummy2, data.frame(trajectory = adaptive_trajectories[i], Z = d[which(d$numbers == tail(genos, 1)),]$`observed effect`))
others = c()
for(j in 1:(length(genos) - 2)) {
others = c(others, colnames(d)[which(colnames(d) == "mutations") + j]) ## Get all steps before final step of evaluated genotype
}
plotting_d = rbind(plotting_d, d[which(d$numbers %in% genos),] %>%
pivot_longer(cols = c(`observed effect`,
`epistatic effect`,
`WT effect`,
others)) %>% ## add those steps into line plots
select(c(mutations, name, value)) %>%
mutate(trajectory = adaptive_trajectories[i]))
}
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(others)
##
## # Now:
## data %>% select(all_of(others))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Next we plot all of those
ggplot(plotting_d, aes(x = mutations, y = value, color = name)) +
geom_line(lwd = 2) +
facet_wrap(~ trajectory) +
geom_hline(data = dummy2, aes(yintercept = Z)) +
geom_hline(data = dummy2, aes(yintercept = Z + log10(1.5)), lty = 2) +
geom_hline(data = dummy2, aes(yintercept = Z - log10(1.5)), lty = 2) +
geom_hline(data = dummy2, aes(yintercept = Z + log10(10)), lty = 2, color = "grey") +
geom_hline(data = dummy2, aes(yintercept = Z - log10(10)), lty = 2, color = "grey") +
theme_classic()
Also return as a table
plotting_d %>%
pivot_wider(names_from = name, values_from = value) %>%
knitr::kable()
| mutations | trajectory | observed effect | epistatic effect | WT effect | 2 step | 3 step | 4 step | final step | 5 step | 6 step |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | DHFR_ki_trajg | 0.0000000 | -0.0124266 | 0.0000000 | 0.0000000 | 0.0000000 | 0.0000000 | 0.0000000 | NA | NA |
| 1 | DHFR_ki_trajg | 1.1303338 | 1.1421598 | 1.1303338 | 1.1303338 | 1.1303338 | 1.1303338 | 1.1303338 | NA | NA |
| 2 | DHFR_ki_trajg | 2.3598355 | 2.3480971 | 1.7504698 | 2.3598355 | 2.3598355 | 2.3598355 | 2.3598355 | NA | NA |
| 3 | DHFR_ki_trajg | 3.3541084 | 3.3664861 | 2.0928925 | 3.3782593 | 3.3541084 | 3.3541084 | 3.3541084 | NA | NA |
| 4 | DHFR_ki_trajg | 4.5820634 | 4.5691010 | 2.4392455 | 6.4837389 | 4.2208458 | 4.5820634 | 4.5820634 | NA | NA |
| 5 | DHFR_ki_trajg | 5.9656720 | 5.9782807 | 2.7914280 | 9.0023947 | 3.9479878 | 6.3680742 | 5.9656720 | NA | NA |
| 0 | DHFR_ki_trajr | 0.0000000 | -0.0155046 | 0.0000000 | 0.0000000 | 0.0000000 | 0.0000000 | 0.0000000 | NA | NA |
| 1 | DHFR_ki_trajr | 1.1303338 | 1.1452378 | 1.1303338 | 1.1303338 | 1.1303338 | 1.1303338 | 1.1303338 | NA | NA |
| 2 | DHFR_ki_trajr | 2.3598355 | 2.3450191 | 1.7504698 | 2.3598355 | 2.3598355 | 2.3598355 | 2.3598355 | NA | NA |
| 3 | DHFR_ki_trajr | 3.3541084 | 3.3695641 | 2.0928925 | 3.3782593 | 3.3541084 | 3.3541084 | 3.3541084 | NA | NA |
| 4 | DHFR_ki_trajr | 4.2810334 | 4.2664918 | 2.4450750 | 5.1111507 | 4.4545256 | 4.2810334 | 4.2810334 | NA | NA |
| 5 | DHFR_ki_trajr | 5.2304489 | 5.2453065 | 2.8049105 | 7.8594519 | 4.0646531 | 5.7290199 | 5.2304489 | NA | NA |
| 0 | MPH_catact_ZnPTM | 0.0000000 | -0.0016066 | 0.0000000 | 0.0000000 | 0.0000000 | 0.0000000 | 0.0000000 | NA | NA |
| 1 | MPH_catact_ZnPTM | 0.9943172 | 0.9914272 | 0.9943172 | 0.9943172 | 0.9943172 | 0.9943172 | 0.9943172 | NA | NA |
| 2 | MPH_catact_ZnPTM | 1.2068259 | 1.2012842 | 1.2661588 | 1.2068259 | 1.2068259 | 1.2068259 | 1.2068259 | NA | NA |
| 3 | MPH_catact_ZnPTM | 1.2405492 | 1.2353742 | 1.9345447 | 1.2404054 | 1.2405492 | 1.2405492 | 1.2405492 | NA | NA |
| 4 | MPH_catact_ZnPTM | 2.3117539 | 2.3103613 | 2.8654937 | 0.4136588 | 2.0807635 | 2.3117539 | 2.3117539 | NA | NA |
| 5 | MPH_catact_ZnPTM | 2.9609462 | 2.9605514 | 2.0776813 | -0.4173805 | 2.1149224 | 3.6596868 | 2.9609462 | NA | NA |
| 0 | NfsA_ec50_2039 | 0.0000000 | -0.0052604 | 0.0000000 | 0.0000000 | 0.0000000 | 0.0000000 | NA | 0.0000000 | NA |
| 1 | NfsA_ec50_2039 | 0.4216039 | 0.4174507 | 0.4216039 | 0.4216039 | 0.4216039 | 0.4216039 | NA | 0.4216039 | NA |
| 2 | NfsA_ec50_2039 | 0.6190933 | 0.6108143 | 0.4785088 | 0.6190933 | 0.6190933 | 0.6190933 | NA | 0.6190933 | NA |
| 3 | NfsA_ec50_2039 | 0.7450748 | 0.7388571 | 0.4244695 | 0.7362056 | 0.7450748 | 0.7450748 | NA | 0.7450748 | NA |
| 4 | NfsA_ec50_2039 | 0.8488047 | 0.8477300 | 0.4121357 | 0.7618568 | 0.7740661 | 0.8488047 | NA | 0.8488047 | NA |
| 5 | NfsA_ec50_2039 | 0.9845273 | 0.9797227 | 0.3801515 | -0.1849312 | 1.2563700 | 1.2151156 | NA | 0.9845273 | NA |
| 0 | NfsA_ec50_3637 | 0.0000000 | -0.0052604 | 0.0000000 | 0.0000000 | 0.0000000 | 0.0000000 | 0.0000000 | 0.0000000 | 0.0000000 |
| 1 | NfsA_ec50_3637 | 0.4800069 | 0.4741952 | 0.4800069 | 0.4800069 | 0.4800069 | 0.4800069 | 0.4800069 | 0.4800069 | 0.4800069 |
| 2 | NfsA_ec50_3637 | 0.6711728 | 0.6693158 | 0.5369118 | 0.6711728 | 0.6711728 | 0.6711728 | 0.6711728 | 0.6711728 | 0.6711728 |
| 3 | NfsA_ec50_3637 | 0.8413595 | 0.8383239 | 0.5245781 | 0.6711856 | 0.8413595 | 0.8413595 | 0.8413595 | 0.8413595 | 0.8413595 |
| 4 | NfsA_ec50_3637 | 0.9180303 | 0.9137495 | 0.4462645 | 0.4576674 | 1.4626969 | 0.9180303 | 0.9180303 | 0.9180303 | 0.9180303 |
| 5 | NfsA_ec50_3637 | 0.9566486 | 0.9519493 | 0.1443651 | 0.8355329 | 1.8320479 | 0.0426191 | 0.9566486 | 0.9566486 | 0.9566486 |
| 6 | NfsA_ec50_3637 | 0.9454686 | 0.9436716 | 0.0727609 | 0.6151170 | 2.4772843 | -0.4757923 | 0.9454686 | 1.6847688 | 0.9454686 |
| 7 | NfsA_ec50_3637 | 1.0043214 | 1.0028427 | 0.0939502 | 1.2003309 | 1.7113959 | -1.2265483 | 1.0043214 | 4.5060330 | -0.9087341 |
| 0 | OXA-48_ic50_CAZtraj1 | 0.0000000 | -0.0026570 | 0.0000000 | 0.0000000 | 0.0000000 | NA | 0.0000000 | NA | NA |
| 1 | OXA-48_ic50_CAZtraj1 | 0.3560259 | 0.3488146 | 0.3560259 | 0.3560259 | 0.3560259 | NA | 0.3560259 | NA | NA |
| 2 | OXA-48_ic50_CAZtraj1 | 1.1398791 | 1.1223656 | 0.3360292 | 1.1398791 | 1.1398791 | NA | 1.1398791 | NA | NA |
| 3 | OXA-48_ic50_CAZtraj1 | 1.4842998 | 1.4833406 | 0.4115762 | 2.0345134 | 1.4842998 | NA | 1.4842998 | NA | NA |
| 4 | OXA-48_ic50_CAZtraj1 | 1.6042261 | 1.5994638 | 0.5255195 | 1.9259530 | 1.3318026 | NA | 1.6042261 | NA | NA |
| 0 | OXA-48_ic50_CAZtraj2 | 0.0000000 | -0.0026570 | 0.0000000 | 0.0000000 | 0.0000000 | 0.0000000 | 0.0000000 | 0.0000000 | NA |
| 1 | OXA-48_ic50_CAZtraj2 | 0.3856063 | 0.3839493 | 0.3856063 | 0.3856063 | 0.3856063 | 0.3856063 | 0.3856063 | 0.3856063 | NA |
| 2 | OXA-48_ic50_CAZtraj2 | 0.8027737 | 0.7969589 | 0.7694216 | 0.8027737 | 0.8027737 | 0.8027737 | 0.8027737 | 0.8027737 | NA |
| 3 | OXA-48_ic50_CAZtraj2 | 1.0934217 | 1.0916060 | 0.9093007 | 1.4373797 | 1.0934217 | 1.0934217 | 1.0934217 | 1.0934217 | NA |
| 4 | OXA-48_ic50_CAZtraj2 | 1.5276299 | 1.5256020 | 0.9005268 | 2.1488613 | 1.1459227 | 1.5276299 | 1.5276299 | 1.5276299 | NA |
| 5 | OXA-48_ic50_CAZtraj2 | 1.5682017 | 1.5672424 | 0.8881931 | 2.2688061 | 1.6984605 | 1.6998067 | 1.5682017 | 1.5682017 | NA |
| 6 | OXA-48_ic50_CAZtraj2 | 1.5078559 | 1.5046215 | 0.8967932 | 1.9530868 | 0.7115869 | 2.2277266 | 1.5078559 | 1.2706843 | NA |
| 0 | OXA-48_ic50_CAZtraj3 | 0.0000000 | -0.0026570 | 0.0000000 | 0.0000000 | 0.0000000 | 0.0000000 | NA | NA | NA |
| 1 | OXA-48_ic50_CAZtraj3 | 0.4329693 | 0.4292205 | 0.4329693 | 0.4329693 | 0.4329693 | 0.4329693 | NA | NA | NA |
| 2 | OXA-48_ic50_CAZtraj3 | 0.9938769 | 0.9737267 | 0.4898741 | 0.9938769 | 0.9938769 | 0.9938769 | NA | NA | NA |
| 3 | OXA-48_ic50_CAZtraj3 | 1.3364597 | 1.3340956 | 0.4941955 | 1.1216466 | 1.3364597 | 1.3364597 | NA | NA | NA |
| 4 | OXA-48_ic50_CAZtraj3 | 1.4149733 | 1.4081219 | 0.5355882 | 0.9184096 | 1.6022940 | 1.4149733 | NA | NA | NA |
| 0 | PTE_catact_2NH | 0.0000000 | -0.0030856 | 0.0000000 | 0.0000000 | 0.0000000 | 0.0000000 | NA | NA | NA |
| 1 | PTE_catact_2NH | 1.4199557 | 1.4175792 | 1.4199557 | 1.4199557 | 1.4199557 | 1.4199557 | NA | NA | NA |
| 2 | PTE_catact_2NH | 2.2095150 | 2.2072984 | 2.2408137 | 2.2095150 | 2.2095150 | 2.2095150 | NA | NA | NA |
| 3 | PTE_catact_2NH | 3.0211893 | 3.0194333 | 2.8546556 | 4.0243530 | 3.0211893 | 3.0211893 | NA | NA | NA |
| 4 | PTE_catact_2NH | 3.2405492 | 3.2383995 | 2.6608355 | 4.1859390 | 2.3618565 | 3.2405492 | NA | NA | NA |
| 0 | TEM_MIC_weinreich | 0.0000000 | 0.0000000 | 0.0000000 | 0.0000000 | 0.0000000 | 0.0000000 | 0.0000000 | NA | NA |
| 1 | TEM_MIC_weinreich | 1.2013971 | 1.2016454 | 1.2013971 | 1.2013971 | 1.2013971 | 1.2013971 | 1.2013971 | NA | NA |
| 2 | TEM_MIC_weinreich | 3.6117233 | 3.6118198 | 1.3803741 | 3.6117233 | 3.6117233 | 3.6117233 | 3.6117233 | NA | NA |
| 3 | TEM_MIC_weinreich | 4.3783979 | 4.3777366 | 1.3803741 | 5.8493869 | 4.3783979 | 4.3783979 | 4.3783979 | NA | NA |
| 4 | TEM_MIC_weinreich | 4.5185139 | 4.5179153 | 1.2893949 | 8.6829825 | 2.5918364 | 4.5185139 | 4.5185139 | NA | NA |
| 5 | TEM_MIC_weinreich | 4.6683859 | 4.6683012 | 1.2893949 | 8.9067386 | 3.4005435 | 4.3989125 | 4.6683859 | NA | NA |
| 0 | DHFR_ic75_palmer | 0.0000000 | -0.0177247 | 0.0000000 | 0.0000000 | 0.0000000 | NA | NA | NA | NA |
| 1 | DHFR_ic75_palmer | 1.8555192 | 1.8548586 | 1.8555192 | 1.8555192 | 1.8555192 | NA | NA | NA | NA |
| 2 | DHFR_ic75_palmer | 2.1931246 | 2.1911919 | 2.1630152 | 2.1931246 | 2.1931246 | NA | NA | NA | NA |
| 3 | DHFR_ic75_palmer | 2.3424227 | 2.3418586 | 2.8064679 | 2.4667779 | 2.3424227 | NA | NA | NA | NA |
Then we make a figure tracking error (visual of supplementary table 6)
plotting_d_supp_4 = plotting_d %>%
mutate(name = str_replace(name, "WT effect", "1 step")) %>%
filter(name != "epistatic effect") %>%
filter(name != "final step") %>%
pivot_wider(names_from = "name",
values_from = "value") %>%
pivot_longer(cols = `1 step`:`6 step`) %>%
drop_na() %>%
mutate(value = abs(value - `observed effect`)) %>%
filter(mutations > as.numeric(str_sub(name, 1, 1)))
supp_4_plot = plotting_d_supp_4 %>%
mutate(`Model Order` = str_replace(name, " step", "th Order")) %>%
ggplot(aes(x = mutations, y = value, color = `Model Order`)) +
geom_hline(yintercept = 0) +
geom_hline(yintercept = 0 + log10(1.5), lty = 2) +
geom_point() +
geom_line(lwd = 0.5) +
facet_wrap(~ trajectory) +
#geom_hline(data = dummy2, aes(yintercept = Z - log10(1.5)), lty = 2) +
#geom_hline(data = dummy2, aes(yintercept = Z + log10(10)), lty = 2, color = "grey") +
#geom_hline(data = dummy2, aes(yintercept = Z - log10(10)), lty = 2, color = "grey") +
theme_classic() +
labs(x = "Predicted mutant order",
y = expression("Absolute error of predicted"*italic(" F"))) +
theme_classic() +
theme(axis.line = element_line(size = 0.2, color = "black"), axis.ticks = element_line(size = 0.2, color = "black"), text = element_text(size = 9), axis.text = element_text(size = 8, color = "black"))
#ggsave("fig_3D.svg", supp_4_plot, width = 180, height = 247/3, dpi = 300, units = "mm")
I can make a ruggedness plot for each adaptive landscape, where deviation from 0 implies epistatic contribution of that genotype
adaptive_traj %>%
filter(mutations > 1 & likely) %>%
ggplot(aes(x = mutations, y = avg)) +
geom_line() +
geom_text_repel(aes(label = genotype), force = 1.5, direction = "y", box.padding = 0, segment.size = 0.2, size = 2.5) +
geom_hline(yintercept = log10(1.5), lty = 2) +
geom_hline(yintercept = log10(1/1.5), lty = 2) +
geom_hline(yintercept = 0) +
facet_wrap(~ unique_id) +
theme_classic()
What if we compare it to the mean at that combination or position as well, not just assumed additivity (0 +/- log10(1.5))?
higher_means = higher_df %>%
unite("partial_id", c(Enzyme, Measurement, Condition), remove = F) %>%
filter(partial_id %in% adaptive_trajectories) %>% ## use partial ID to only pick up the adaptive trajectories
group_by(unique_id) %>%
mutate(avg = mean(avg)) %>% ## Is arithmetic mean OK here?
ungroup() %>%
distinct(unique_id, .keep_all = T) ## Remove unique_id replicates
adaptive_means = higher_means %>% arrange(partial_id, pos) %>% pull(avg)
adaptive_traj = adaptive_traj %>% arrange(unique_id, pos) %>% mutate(means = adaptive_means)
Then we make the plot
trajectory_labeller = function(variable,value){
return(trajectory_names[value])
}
trajectory_names = list(
"DHFR_ic75_palmer" = expression("DHFR Palmer"*italic(" et al.")),
"DHFR_ki_trajg" = expression("DHFR Traj. G"),
"DHFR_ki_trajr" = expression("DHFR Traj. R"),
"MPH_catact_ZnPTM" = expression("MPH Traj. Zn"),
"NfsA_ec50_2039" = expression("NfsA Traj. 20_39"),
"NfsA_ec50_3637" = expression("NfsA Traj. 36_37"),
"OXA-48_ic50_CAZtraj1" = expression("OXA-48 CAZ Traj. 1"),
"OXA-48_ic50_CAZtraj2" = expression("OXA-48 CAZ Traj. 2"),
"OXA-48_ic50_CAZtraj3" = expression("OXA-48 CAZ Traj. 3"),
"PTE_catact_2NH" = expression("PTE Traj. 2NH"),
"TEM_MIC_weinreich" = expression("TEM Weinreich"*italic(" et al."))
)
adaptive_traj_plot = adaptive_traj %>%
filter(likely & mutations > 1) %>%
mutate(wt_bg = avg) %>%
pivot_longer(c(wt_bg, means)) %>%
ggplot(aes(x = factor(mutations), y = value, fill = name)) +
geom_bar(stat = "identity", position = position_dodge()) +
geom_hline(yintercept = log10(1.5), lty = 2, size = 0.2) +
geom_hline(yintercept = log10(1/1.5), lty = 2, size = 0.2) +
geom_hline(yintercept = 0) +
scale_color_manual(values = c("#edae49", "#4988ed")) +
facet_wrap(~ unique_id,
scales = "free",
labeller = trajectory_labeller) +
labs(x = "Mutational Step",
y = expression(epsilon)) +
theme_classic() +
theme(text = element_text(size=9), axis.text = element_text(size=8, color = "black"),
legend.position = "none", axis.line=element_line()) +
scale_fill_manual(values = c("#4988ed", "#edae49")) +
scale_y_continuous(limits=c(-3,3))
## Warning: The `labeller` API has been updated. Labellers taking `variable` and `value`
## arguments are now deprecated.
## ℹ See labellers documentation.
adaptive_traj_plot
#ggsave("fig_4.svg", adaptive_traj_plot, width = 180, height = 247/2, dpi = 300, units = "mm")
Which one of the genotypes seen above are the outliers?
adaptive_traj %>%
filter(likely & mutations > 1) %>%
mutate(outlier = abs(means - avg) > log10(1.5)) %>%
select(c(genotype, unique_id)) %>%
knitr::kable()
| genotype | unique_id |
|---|---|
| x0xx00 | DHFR_ic75_palmer |
| 00xx00 | DHFR_ic75_palmer |
| xxxxx | DHFR_ki_trajg |
| 0xxxx | DHFR_ki_trajg |
| 0xx0x | DHFR_ki_trajg |
| 00x0x | DHFR_ki_trajg |
| xxxxx | DHFR_ki_trajr |
| xxx0x | DHFR_ki_trajr |
| 0xx0x | DHFR_ki_trajr |
| 00x0x | DHFR_ki_trajr |
| 0xx00 | MPH_catact_ZnPTM |
| 0xx0x | MPH_catact_ZnPTM |
| xxxxx | MPH_catact_ZnPTM |
| xxx0x | MPH_catact_ZnPTM |
| xxx00xx | NfsA_ec50_2039 |
| x0x00xx | NfsA_ec50_2039 |
| x0000x0 | NfsA_ec50_2039 |
| x0000xx | NfsA_ec50_2039 |
| xxxxxxx | NfsA_ec50_3637 |
| xxx0xxx | NfsA_ec50_3637 |
| x0x0xxx | NfsA_ec50_3637 |
| x0x00x0 | NfsA_ec50_3637 |
| x0x00xx | NfsA_ec50_3637 |
| x0000x0 | NfsA_ec50_3637 |
| xxxx | OXA-48_ic50_CAZtraj1 |
| 0xxx | OXA-48_ic50_CAZtraj1 |
| 0x0x | OXA-48_ic50_CAZtraj1 |
| 00x0x0 | OXA-48_ic50_CAZtraj2 |
| 00x0xx | OXA-48_ic50_CAZtraj2 |
| xxxxxx | OXA-48_ic50_CAZtraj2 |
| 0xxxxx | OXA-48_ic50_CAZtraj2 |
| 0xx0xx | OXA-48_ic50_CAZtraj2 |
| xx0x00 | OXA-48_ic50_CAZtraj3 |
| xx0x0x | OXA-48_ic50_CAZtraj3 |
| x00x00 | OXA-48_ic50_CAZtraj3 |
| xxx0x0 | PTE_catact_2NH |
| xx00x0 | PTE_catact_2NH |
| 0x00x0 | PTE_catact_2NH |
| 00x0x | TEM_MIC_weinreich |
| xxxxx | TEM_MIC_weinreich |
| 0xxxx | TEM_MIC_weinreich |
| 0xx0x | TEM_MIC_weinreich |
Now we swich gears to look at the entire adaptive landscapes, not just the most accessible trajectory.
What % of adaptive landscape genotypes are outliers from the mean? What % of adaptive accessible trajectory genotypes are outliers from the mean? What % of adaptive less accessible trajectory genotypes are outliers from the mean?
adaptive_traj_final = adaptive_traj %>%
mutate(outlier = abs(means - avg) > log10(1.5))
length(adaptive_traj_final$outlier)
## [1] 645
sum(adaptive_traj_final$outlier)
## [1] 309
sum(adaptive_traj_final$outlier) / length(adaptive_traj_final$outlier)
## [1] 0.4790698
length(filter(adaptive_traj_final, likely) %>% pull(outlier))
## [1] 53
sum(filter(adaptive_traj_final, likely) %>% pull(outlier))
## [1] 30
sum(filter(adaptive_traj_final, likely) %>% pull(outlier)) / length(filter(adaptive_traj_final, likely) %>% pull(outlier))
## [1] 0.5660377
length(filter(adaptive_traj_final, !likely) %>% pull(outlier))
## [1] 592
sum(filter(adaptive_traj_final, !likely) %>% pull(outlier))
## [1] 279
sum(filter(adaptive_traj_final, !likely) %>% pull(outlier)) / length(filter(adaptive_traj_final, !likely) %>% pull(outlier))
## [1] 0.4712838
Next we try to see if adaptive vs random trajectories show diminishing returns.
Doing for single mutants first for every single landscape…
higher_df_diminish = higher_df %>%
unite("partial_id", c(Enzyme, Measurement, Condition)) %>%
filter(mutations == 1) %>%
mutate(from = str_replace_all(genotype, "x", "0"),
to = str_replace_all(genotype, "x", "1"))
start_effect = c()
for(i in 1:dim(higher_df_diminish)[1]) {
if(!str_detect(higher_df_diminish$from[i], "1")) {
start_effect = c(start_effect, 0) # If starts from WT, start effect is 0
} else {
start_effect = c(start_effect, filter(fit_land, unique_id == higher_df_diminish$partial_id[i] & id == higher_df_diminish$from[i])
%>% pull(effect))
}
}
higher_df_diminish$start_effect = start_effect
ggplot(higher_df_diminish, aes(x = start_effect, y = avg)) +
geom_point() +
theme_classic()
Next only looking at adaptive landscapes
higher_df_diminish = higher_df %>%
unite("partial_id", c(Enzyme, Measurement, Condition)) %>%
filter(mutations == 1) %>%
mutate(from = str_replace_all(genotype, "x", "0"),
to = str_replace_all(genotype, "x", "1")) %>%
filter(partial_id %in% adaptive_trajectories)
start_effect = c()
for(i in 1:dim(higher_df_diminish)[1]) {
if(!str_detect(higher_df_diminish$from[i], "1")) {
start_effect = c(start_effect, 0) # If starts from WT, start effect is 0
} else {
start_effect = c(start_effect, filter(fit_land, unique_id == higher_df_diminish$partial_id[i] & id == higher_df_diminish$from[i])
%>% pull(effect))
}
}
higher_df_diminish$start_effect = start_effect
ggplot(higher_df_diminish, aes(x = start_effect, y = avg)) +
geom_point() +
facet_wrap(~ partial_id) +
geom_smooth(method = 'lm', formula = y ~ x) +
stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "~~")), color = "red", geom = "label", label.x = -2.5, label.y = 4, r.accuracy = 0.01, p.accuracy = 0.01) +
theme_classic()
## Warning: The dot-dot notation (`..rr.label..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(rr.label)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
What about only looking at the best step from each starting genotype?
higher_df_diminish %>%
group_by(partial_id, from) %>%
summarise(avg = max(avg), start_effect = start_effect) %>%
distinct() %>%
ggplot(aes(x = start_effect, y = avg)) +
geom_point() +
geom_smooth(method = 'lm', formula = y ~ x) +
stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "~~")), color = "red", geom = "label", label.x = -2, label.y = 4.7, r.accuracy = 0.01, p.accuracy = 0.01) +
facet_wrap(~ partial_id) +
theme_classic()
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'partial_id', 'from'. You can override
## using the `.groups` argument.
What about only looking at the adaptive trajectories?
all_traj_points = all_traj %>%
filter(likely & unique_id %in% adaptive_trajectories) %>%
mutate(genotype = str_replace_all(genotype, "x", "1")) %>%
select(genotype, unique_id)
higher_df_diminish_traj = tibble()
prev_geno = c()
for(i in 1:dim(all_traj_points)[1]) {
if(length(prev_geno) > 0) {
higher_df_diminish_traj = rbind(higher_df_diminish_traj, higher_df_diminish %>% filter(partial_id == all_traj_points$unique_id[i] & to == all_traj_points$genotype[i]) %>% filter(from == prev_geno))
} else {
higher_df_diminish_traj = rbind(higher_df_diminish_traj, higher_df_diminish %>% filter(partial_id == all_traj_points$unique_id[i] & to == all_traj_points$genotype[i]))
}
if(i != dim(all_traj_points)[1]) {
if(all_traj_points$unique_id[i + 1] == all_traj_points$unique_id[i]) {
prev_geno = tail(higher_df_diminish_traj$to, 1)
} else {
prev_geno = c()
}
}
}
higher_df_diminish_traj %>%
ggplot(aes(x = start_effect, y = avg)) +
geom_point() +
geom_smooth(method = 'lm', formula = y ~ x) +
stat_cor(aes(label = paste(..rr.label.., ..p.label.., sep = "~~")), color = "red", geom = "label", label.x = 0, label.y = 3.7, r.accuracy = 0.01, p.accuracy = 0.01) +
facet_wrap(~ partial_id) +
theme_classic()
We can also do a side by side correlation comparison of starting effect vs MAXIMUM delFs as well as vs ALL POSSIBLE delFs
max_list = higher_df_diminish %>%
group_by(partial_id, from) %>%
summarise(avg = max(avg), start_effect = start_effect) %>%
distinct()
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
## always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `summarise()` has grouped output by 'partial_id', 'from'. You can override
## using the `.groups` argument.
maxes = c()
for(entry in 1:dim(higher_df_diminish)[1]) {
maxes = c(maxes, ifelse(any(higher_df_diminish[entry,]$from == max_list$from & higher_df_diminish[entry,]$partial_id == max_list$partial_id & higher_df_diminish[entry,]$avg == max_list$avg & higher_df_diminish[entry,]$start_effect == max_list$start_effect), T, F))
}
higher_df_diminish$is_max = maxes
higher_df_diminish_plot = higher_df_diminish %>%
ggplot(aes(x = start_effect, y = avg)) +
geom_point(aes(fill = is_max), shape = 21, color = "black", stroke = 0.1) +
geom_smooth(method = 'lm', formula = y ~ x, color = "grey") +
geom_smooth(data = higher_df_diminish %>% filter(is_max), method = 'lm', formula = y ~ x, color = "#edae49") +
stat_cor(aes(label = paste(..rr.label.., sep = "~~")), label.x = 2, label.y = 5.5, r.accuracy = 0.01, size = 2.5, color = "grey") +
stat_cor(data = higher_df_diminish %>% filter(is_max), aes(label = paste(..rr.label.., sep = "~~")), label.x = 2, label.y = 4, r.accuracy = 0.01, size = 2.5, color = "#edae49") +
facet_wrap(~ partial_id, scales = "free") +
labs(x = expression("Starting Function ("*italic("F")*")"),
y = expression(Delta*'F')) +
theme_classic() +
theme(text = element_text(size=9), axis.text = element_text(size=8, color = "black"),
legend.position = "none", axis.line=element_line()) +
scale_fill_manual(values = c("grey", "#edae49")) +
scale_x_continuous(limits=c(-6,6)) +
scale_y_continuous(limits=c(-6,6))
higher_df_diminish_plot
#ggsave("supp_fig_3.svg", higher_df_diminish_plot, width = 180, height = 247/2, dpi = 300, units = "mm")
Modify adaptive traj to also have the genotypic effect from fit_land. To do this I need to take the unique_id from adaptive traj and the genotype after x -> 1 modification from adaptive traj, and find the corresponding value of the genotype in fit_land
traj_effects = c()
for(i in 1:dim(adaptive_traj)[1]){
traj_effects= c(traj_effects, fit_land %>%
filter(unique_id == adaptive_traj$unique_id[i]) %>%
filter(id == str_replace_all(adaptive_traj$genotype[i], "x", "1")) %>%
pull(effect))
}
adaptive_traj$traj_effects = traj_effects
Now that adaptive_traj has genotype effects, we can ask the question: Did the idiosyncratic epistasis at that order specifically enable or restrict that genotype.
access = c()
access_no_idio = c()
access_mean = c()
for(i in 1:dim(adaptive_traj)[1]) {
current_test = adaptive_traj[i,]
if(current_test$mutations != 1) {
loc = str_locate_all(current_test$genotype, "x")[[1]][,1]
len = str_length(current_test$genotype)
func = c()
# Get all previous constituents
for(j in 1:length(loc)) {
const = current_test$genotype
substr(const, loc[j], loc[j]) = "0"
func = c(func, adaptive_traj %>% filter(unique_id == current_test$unique_id & genotype == const) %>% pull(traj_effects))
}
access = c(access, sum(current_test$traj_effects - func >= 0))
access_no_idio = c(access_no_idio, sum((current_test$traj_effects - current_test$avg) - func >= 0)) # Access without epistasis
access_mean = c(access_mean, sum((current_test$traj_effects - (current_test$avg - current_test$means)) - func >= 0))
} else {
access = c(access, NA)
access_no_idio = c(access_no_idio, NA)
access_mean = c(access_mean, NA)
}
}
Now let’s look at the accessibility metrics in a tibble
access_df = tibble(access = access,
access_no_idio = access_no_idio,
access_mean = access_mean)
access_df = access_df %>% mutate(access_w_mean = access_mean - access,
access_w_idio = access - access_no_idio)
adaptive_traj_access = cbind(adaptive_traj, access_df)
head(adaptive_traj_access)
## genotype avg pos mutations likely unique_id
## 1 x00000 0.64345268 p1 1 FALSE DHFR_ic75_palmer
## 2 xx0000 0.01200891 p1|p21 2 FALSE DHFR_ic75_palmer
## 3 xxx000 4.09150055 p1|p21|p26 3 FALSE DHFR_ic75_palmer
## 4 xxxx00 -3.44007396 p1|p21|p26|p28 4 FALSE DHFR_ic75_palmer
## 5 xxxxx0 0.21273191 p1|p21|p26|p28|p30 5 FALSE DHFR_ic75_palmer
## 6 xxxxxx -7.18913559 p1|p21|p26|p28|p30|p94 6 FALSE DHFR_ic75_palmer
## enzyme_name measure_type cond means traj_effects access
## 1 DHFR ic75 palmer 1.20623549 0.6434527 NA
## 2 DHFR ic75 palmer 0.82391077 1.3838154 2
## 3 DHFR ic75 palmer 0.21721618 1.9106244 3
## 4 DHFR ic75 palmer 0.08693999 2.3242825 3
## 5 DHFR ic75 palmer -3.38183588 2.1038037 0
## 6 DHFR ic75 palmer -7.18913559 2.1583625 2
## access_no_idio access_mean access_w_mean access_w_idio
## 1 NA NA NA NA
## 2 2 2 0 0
## 3 1 1 -2 2
## 4 4 4 1 -1
## 5 0 0 0 0
## 6 6 2 0 -4
What does accessibility look like in adaptive trajectories only?
head(adaptive_traj_access %>%
filter(likely))
## genotype avg pos mutations likely unique_id
## 1 x0xx00 -0.12435525 p1|p26|p28 3 TRUE DHFR_ic75_palmer
## 2 00xx00 0.03010940 p26|p28 2 TRUE DHFR_ic75_palmer
## 3 000x00 1.85551916 p28 1 TRUE DHFR_ic75_palmer
## 4 xxxxx -0.40240226 p21|p26|p28|p30|p94 5 TRUE DHFR_ki_trajg
## 5 0xxxx 0.36121760 p26|p28|p30|p94 4 TRUE DHFR_ki_trajg
## 6 0xx0x -0.02415083 p26|p28|p94 3 TRUE DHFR_ki_trajg
## enzyme_name measure_type cond means traj_effects access
## 1 DHFR ic75 palmer -0.37628762 2.342423 3
## 2 DHFR ic75 palmer 0.51316879 2.193125 2
## 3 DHFR ic75 palmer 0.02486007 1.855519 NA
## 4 DHFR ki trajg -0.40240226 5.965672 5
## 5 DHFR ki trajg 0.16001647 4.582063 4
## 6 DHFR ki trajg -0.03088869 3.354108 3
## access_no_idio access_mean access_w_mean access_w_idio
## 1 3 2 -1 0
## 2 2 2 0 0
## 3 NA NA NA NA
## 4 5 5 0 0
## 5 4 4 0 0
## 6 3 3 0 0
And again for non-adaptive trajectories
head(adaptive_traj_access %>%
filter(!likely))
## genotype avg pos mutations likely unique_id
## 1 x00000 0.64345268 p1 1 FALSE DHFR_ic75_palmer
## 2 xx0000 0.01200891 p1|p21 2 FALSE DHFR_ic75_palmer
## 3 xxx000 4.09150055 p1|p21|p26 3 FALSE DHFR_ic75_palmer
## 4 xxxx00 -3.44007396 p1|p21|p26|p28 4 FALSE DHFR_ic75_palmer
## 5 xxxxx0 0.21273191 p1|p21|p26|p28|p30 5 FALSE DHFR_ic75_palmer
## 6 xxxxxx -7.18913559 p1|p21|p26|p28|p30|p94 6 FALSE DHFR_ic75_palmer
## enzyme_name measure_type cond means traj_effects access
## 1 DHFR ic75 palmer 1.20623549 0.6434527 NA
## 2 DHFR ic75 palmer 0.82391077 1.3838154 2
## 3 DHFR ic75 palmer 0.21721618 1.9106244 3
## 4 DHFR ic75 palmer 0.08693999 2.3242825 3
## 5 DHFR ic75 palmer -3.38183588 2.1038037 0
## 6 DHFR ic75 palmer -7.18913559 2.1583625 2
## access_no_idio access_mean access_w_mean access_w_idio
## 1 NA NA NA NA
## 2 2 2 0 0
## 3 1 1 -2 2
## 4 4 4 1 -1
## 5 0 0 0 0
## 6 6 2 0 -4
Now I can ask a variety of questions with this dataset. For example, lets see where epistasis changes node visitation
dim(adaptive_traj_access %>%
drop_na() %>%
filter(access != access_no_idio))[1] / dim(adaptive_traj_access %>% drop_na())[1]
## [1] 0.4939966
paste0(dim(adaptive_traj_access %>%
drop_na() %>%
filter(access != access_no_idio))[1], "/", dim(adaptive_traj_access %>% drop_na())[1])
## [1] "288/583"
How often does epistasis changes node visitation for most likely trajectories
dim(adaptive_traj_access %>%
filter(likely) %>%
drop_na() %>%
filter(access != access_no_idio))[1] / dim(adaptive_traj_access %>% filter(likely) %>% drop_na())[1]
## [1] 0.2619048
paste0(dim(adaptive_traj_access %>%
filter(likely) %>%
drop_na() %>%
filter(access != access_no_idio))[1], "/", dim(adaptive_traj_access %>% filter(likely) %>% drop_na())[1])
## [1] "11/42"
Where is idiosyncratic visitation different from the mean WHEN epistasis affects node visitation?
dim(adaptive_traj_access %>%
drop_na() %>%
filter(access != access_mean & access != access_no_idio))[1] / dim(adaptive_traj_access %>% filter(access != access_no_idio) %>% drop_na())[1]
## [1] 0.7326389
paste0(dim(adaptive_traj_access %>%
drop_na() %>%
filter(access != access_mean & access != access_no_idio))[1], "/", dim(adaptive_traj_access %>% filter(access != access_no_idio) %>% drop_na())[1])
## [1] "211/288"
Where is idiosyncratic visitation different from the mean in the likely traj?
dim(adaptive_traj_access %>%
filter(likely) %>%
drop_na() %>%
filter(access != access_mean))[1] / dim(adaptive_traj_access %>% filter(likely) %>% drop_na())[1]
## [1] 0.1904762
paste0(dim(adaptive_traj_access %>%
filter(likely) %>%
drop_na() %>%
filter(access != access_mean))[1], "/", dim(adaptive_traj_access %>% filter(likely) %>% drop_na())[1])
## [1] "8/42"
Restrictive idiosyncrasy: When negative epistasis prevents node visitation AND mean epistasis would have allowed for it
head(adaptive_traj_access %>%
drop_na() %>%
filter(avg < log10(1/1.5) & access_w_idio < 0 & access_w_mean > 0 & means > 0))
## genotype avg pos mutations likely unique_id
## 1 xxxx00 -3.4400740 p1|p21|p26|p28 4 FALSE DHFR_ic75_palmer
## 2 x0xx0x -1.1042305 p1|p26|p28|p94 4 FALSE DHFR_ic75_palmer
## 3 0xx000 -4.0673669 p21|p26 2 FALSE DHFR_ic75_palmer
## 4 0x0xx0 -1.6714200 p21|p28|p30 3 FALSE DHFR_ic75_palmer
## 5 0x00x0 -0.2888077 p21|p30 2 FALSE DHFR_ic75_palmer
## 6 0x00xx -1.5696224 p21|p30|p94 3 FALSE DHFR_ic75_palmer
## enzyme_name measure_type cond means traj_effects access access_no_idio
## 1 DHFR ic75 palmer 0.08693999 2.3242825 3 4
## 2 DHFR ic75 palmer 2.57320556 2.3263359 3 4
## 3 DHFR ic75 palmer 0.53515465 -3.0315171 0 2
## 4 DHFR ic75 palmer 0.76025803 -0.8041003 0 1
## 5 DHFR ic75 palmer 0.29888333 0.7067178 1 2
## 6 DHFR ic75 palmer 0.27945161 -0.5436340 0 2
## access_mean access_w_mean access_w_idio
## 1 4 1 -1
## 2 4 1 -1
## 3 2 2 -2
## 4 2 2 -1
## 5 2 1 -1
## 6 2 2 -2
What about positive epistasis that does not affect or increases visitation?
head(adaptive_traj_access %>%
drop_na() %>%
filter(avg > log10(1.5) & access_w_idio >= 0))
## genotype avg pos mutations likely unique_id
## 1 xxx000 4.0915006 p1|p21|p26 3 FALSE DHFR_ic75_palmer
## 2 xxxxx0 0.2127319 p1|p21|p26|p28|p30 5 FALSE DHFR_ic75_palmer
## 3 xxxx0x 10.4358638 p1|p21|p26|p28|p94 5 FALSE DHFR_ic75_palmer
## 4 xxx0xx 2.5755328 p1|p21|p26|p30|p94 5 FALSE DHFR_ic75_palmer
## 5 xx0x00 0.7938784 p1|p21|p28 3 FALSE DHFR_ic75_palmer
## 6 xx0xx0 1.9958051 p1|p21|p28|p30 4 FALSE DHFR_ic75_palmer
## enzyme_name measure_type cond means traj_effects access
## 1 DHFR ic75 palmer 0.21721618 1.910624 3
## 2 DHFR ic75 palmer -3.38183588 2.103804 0
## 3 DHFR ic75 palmer 6.84129599 2.294466 3
## 4 DHFR ic75 palmer -1.01903501 2.313867 5
## 5 DHFR ic75 palmer 0.01045432 2.071882 3
## 6 DHFR ic75 palmer 1.86938029 2.184691 3
## access_no_idio access_mean access_w_mean access_w_idio
## 1 1 1 -2 2
## 2 0 0 0 0
## 3 0 1 -2 3
## 4 0 0 -5 5
## 5 1 1 -2 2
## 6 1 2 -1 2
head(adaptive_traj_access %>%
drop_na() %>%
filter(avg > log10(1.5) & access_w_idio > 0))
## genotype avg pos mutations likely unique_id
## 1 xxx000 4.0915006 p1|p21|p26 3 FALSE DHFR_ic75_palmer
## 2 xxxx0x 10.4358638 p1|p21|p26|p28|p94 5 FALSE DHFR_ic75_palmer
## 3 xxx0xx 2.5755328 p1|p21|p26|p30|p94 5 FALSE DHFR_ic75_palmer
## 4 xx0x00 0.7938784 p1|p21|p28 3 FALSE DHFR_ic75_palmer
## 5 xx0xx0 1.9958051 p1|p21|p28|p30 4 FALSE DHFR_ic75_palmer
## 6 xx0xxx 3.1289862 p1|p21|p28|p30|p94 5 FALSE DHFR_ic75_palmer
## enzyme_name measure_type cond means traj_effects access
## 1 DHFR ic75 palmer 0.21721618 1.910624 3
## 2 DHFR ic75 palmer 6.84129599 2.294466 3
## 3 DHFR ic75 palmer -1.01903501 2.313867 5
## 4 DHFR ic75 palmer 0.01045432 2.071882 3
## 5 DHFR ic75 palmer 1.86938029 2.184691 3
## 6 DHFR ic75 palmer -0.46558155 2.278754 4
## access_no_idio access_mean access_w_mean access_w_idio
## 1 1 1 -2 2
## 2 0 1 -2 3
## 3 0 0 -5 5
## 4 1 1 -2 2
## 5 1 2 -1 2
## 6 2 2 -2 2
Permissive idiosyncrasy: When positive epistasis enables node visitation AND mean epistasis would have prevented it
head(adaptive_traj_access %>%
drop_na() %>%
filter(avg > log10(1.5) & access_w_idio > 0 & access_w_mean < 0 & means < 0))
## genotype avg pos mutations likely unique_id
## 1 xxx0xx 2.5755328 p1|p21|p26|p30|p94 5 FALSE DHFR_ic75_palmer
## 2 xx0xxx 3.1289862 p1|p21|p28|p30|p94 5 FALSE DHFR_ic75_palmer
## 3 xx000x 1.0635938 p1|p21|p94 3 FALSE DHFR_ic75_palmer
## 4 x0xxx0 0.2566377 p1|p26|p28|p30 4 FALSE DHFR_ic75_palmer
## 5 x0xxxx 0.5135761 p1|p26|p28|p30|p94 5 FALSE DHFR_ic75_palmer
## 6 x0x0xx 0.2071172 p1|p26|p30|p94 4 FALSE DHFR_ic75_palmer
## enzyme_name measure_type cond means traj_effects access
## 1 DHFR ic75 palmer -1.01903501 2.313867 5
## 2 DHFR ic75 palmer -0.46558155 2.278754 4
## 3 DHFR ic75 palmer -0.37347938 1.983175 3
## 4 DHFR ic75 palmer -1.17749221 2.322219 3
## 5 DHFR ic75 palmer -3.08099171 2.324282 4
## 6 DHFR ic75 palmer -0.04561225 2.217484 4
## access_no_idio access_mean access_w_mean access_w_idio
## 1 0 0 -5 5
## 2 2 2 -2 2
## 3 1 0 -3 2
## 4 0 0 -3 3
## 5 1 0 -4 3
## 6 1 1 -3 3
Even more interesting are the nodes with are either ENTIRELY accessible or inaccessible due to epistasis
head(adaptive_traj_access %>%
drop_na() %>%
filter(avg > log10(1.5) & access_no_idio == 0 & access > 0))
## genotype avg pos mutations likely unique_id
## 1 xxxx0x 10.4358638 p1|p21|p26|p28|p94 5 FALSE DHFR_ic75_palmer
## 2 xxx0xx 2.5755328 p1|p21|p26|p30|p94 5 FALSE DHFR_ic75_palmer
## 3 x0xxx0 0.2566377 p1|p26|p28|p30 4 FALSE DHFR_ic75_palmer
## 4 x00x0x 1.0706822 p1|p28|p94 3 FALSE DHFR_ic75_palmer
## 5 0xx00x 5.0316956 p21|p26|p94 3 FALSE DHFR_ic75_palmer
## 6 x0xxx 1.8737920 p21|p28|p30|p94 4 FALSE DHFR_ki_trajg
## enzyme_name measure_type cond means traj_effects access access_no_idio
## 1 DHFR ic75 palmer 6.841296 2.294466 3 0
## 2 DHFR ic75 palmer -1.019035 2.313867 5 0
## 3 DHFR ic75 palmer -1.177492 2.322219 3 0
## 4 DHFR ic75 palmer 1.647842 2.283301 3 0
## 5 DHFR ic75 palmer 1.695299 1.363612 3 0
## 6 DHFR ki trajg 1.672591 4.198657 4 0
## access_mean access_w_mean access_w_idio
## 1 1 -2 3
## 2 0 -5 5
## 3 0 -3 3
## 4 3 0 3
## 5 1 -2 3
## 6 4 0 4
head(adaptive_traj_access %>%
drop_na() %>%
filter(avg < log10(1/1.5) & access_no_idio > 0 & access == 0))
## genotype avg pos mutations likely unique_id
## 1 xx0x0x -5.214086 p1|p21|p28|p94 4 FALSE DHFR_ic75_palmer
## 2 0xx000 -4.067367 p21|p26 2 FALSE DHFR_ic75_palmer
## 3 0xxx0x -4.725753 p21|p26|p28|p94 4 FALSE DHFR_ic75_palmer
## 4 0x0xx0 -1.671420 p21|p28|p30 3 FALSE DHFR_ic75_palmer
## 5 0x00xx -1.569622 p21|p30|p94 3 FALSE DHFR_ic75_palmer
## 6 000xxx -2.043647 p28|p30|p94 3 FALSE DHFR_ic75_palmer
## enzyme_name measure_type cond means traj_effects access access_no_idio
## 1 DHFR ic75 palmer -0.2289452 -3.0315171 0 3
## 2 DHFR ic75 palmer 0.5351546 -3.0315171 0 2
## 3 DHFR ic75 palmer -0.8278136 -0.4225082 0 4
## 4 DHFR ic75 palmer 0.7602580 -0.8041003 0 1
## 5 DHFR ic75 palmer 0.2794516 -0.5436340 0 2
## 6 DHFR ic75 palmer 0.6800592 -0.1573908 0 2
## access_mean access_w_mean access_w_idio
## 1 1 1 -3
## 2 2 2 -2
## 3 4 4 -4
## 4 2 2 -1
## 5 2 2 -2
## 6 3 3 -2
Specific epistasis examples, starting with PTE
with_epi = fit_land %>%
filter(unique_id == "PTE_catact_2NH" & id == "111010") %>%
pull(effect)
epi = higher_df %>% filter(partial_id == "PTE_catact_2NH" & genotype == "xxx0x0") %>% pull(avg)
mean_epi = adaptive_traj %>% filter(unique_id == "PTE_catact_2NH" & genotype == "xxx0x0") %>% pull(means)
wo_epi = with_epi - epi
w_mean_epi = with_epi - epi + mean_epi
fig_4_new = fit_land %>%
filter(unique_id == "PTE_catact_2NH") %>%
filter(id == "000010" | id == "010010" | id == "110010" | id == "111010" | id == "111110" | id == "111111") %>%
add_row(id = "000000",
effect = 0,
muts = 0,
junk1 = NA,
junk2 = NA,
enz = "PTE",
unique_id = "PTE_catact_2NH",
.before = 1) %>%
add_row(id = "alt",
effect = wo_epi,
muts = 4,
junk1 = NA,
junk2 = NA,
enz = "PTE",
unique_id = "PTE_catact_2NH",
.before = 1) %>%
add_row(id = "alt2",
effect = w_mean_epi,
muts = 4,
junk1 = NA,
junk2 = NA,
enz = "PTE",
unique_id = "PTE_catact_2NH",
.before = 1) %>%
select(-c("junk1", "junk2")) %>%
mutate(muts = factor(muts)) %>%
ggplot(aes(x = muts, y = effect)) +
geom_point(shape = 1, size = 3) +
labs(x = "Step",
y = expression("log"[10]*" Function") ) +
theme_classic() +
theme(axis.line = element_line(size = 0.2, color = "black"),
axis.ticks = element_line(size = 0.2, color = "black"),
text = element_text(size = 9),
axis.text = element_text(size = 8, color = "black"))
fig_4_new
#ggsave("fig_4A_new.svg", fig_4_new, width = 180/2.5, height = 247/4, dpi = 300, units = "mm")
Next one is MPH
with_epi = fit_land %>%
filter(unique_id == "MPH_catact_ZnPTM" & id == "10001") %>%
pull(effect)
epi = higher_df %>% filter(partial_id == "MPH_catact_ZnPTM" & genotype == "x000x") %>% pull(avg)
mean_epi = adaptive_traj %>% filter(unique_id == "MPH_catact_ZnPTM" & genotype == "x000x") %>% pull(means)
wo_epi = with_epi - epi
w_mean_epi = with_epi - epi + mean_epi
fig_4b_new = fit_land %>%
filter(unique_id == "MPH_catact_ZnPTM") %>%
filter(id == "10000" | id == "10001" | id == "11001" | id == "11101" | id == "11111") %>%
add_row(id = "00000",
effect = 0,
muts = 0,
junk1 = NA,
junk2 = NA,
enz = "MPH",
unique_id = "MPH_catact_ZnPTM",
.before = 1) %>%
add_row(id = "alt",
effect = wo_epi,
muts = 2,
junk1 = NA,
junk2 = NA,
enz = "MPH",
unique_id = "MPH_catact_ZnPTM") %>%
add_row(id = "alt2",
effect = w_mean_epi,
muts = 2,
junk1 = NA,
junk2 = NA,
enz = "MPH",
unique_id = "MPH_catact_ZnPTM") %>%
select(-c("junk1", "junk2")) %>%
mutate(muts = factor(muts)) %>%
ggplot(aes(x = muts, y = effect)) +
geom_point(shape = 1, size = 3) +
labs(x = "Step",
y = expression("log"[10]*" Function") ) +
theme_classic() +
theme(axis.line = element_line(size = 0.2, color = "black"),
axis.ticks = element_line(size = 0.2, color = "black"),
text = element_text(size = 9),
axis.text = element_text(size = 8, color = "black"))
fig_4b_new
#ggsave("fig_4B_new.svg", fig_4b_new, width = 180/2.5, height = 247/4, dpi = 300, units = "mm")
Here we attempt to see whether the function of the genotype F is an indicator of the magnitude of epistasis
higher_df_start_to_epi = higher_df %>%
unite("partial_id", c(Enzyme, Measurement, Condition)) %>%
mutate(from = str_replace_all(genotype, "x", "0"),
to = str_replace_all(genotype, "x", "1"))
start_effect = c()
for(i in 1:dim(higher_df_start_to_epi)[1]) {
if(!str_detect(higher_df_start_to_epi$from[i], "1")) {
start_effect = c(start_effect, 0) # If starts from WT, start effect is 0
} else {
start_effect = c(start_effect, filter(fit_land, unique_id == higher_df_start_to_epi$partial_id[i] & id == higher_df_start_to_epi$from[i])
%>% pull(effect))
}
}
higher_df_start_to_epi$start_effect = start_effect
The higher_df_start_to_epi tibble contains information
regarding specific epistasis in the avg variable and
starting effect of the genotype in the start_effect. We can
plot the correlation for all landscapes
higher_df_start_to_epi %>%
filter(mutations > 1) %>%
ggplot(aes(x = start_effect, y = avg)) +
geom_point() +
theme_classic()
Then again for adaptive landscapes only
higher_df_start_to_epi %>%
filter(mutations > 1 & partial_id %in% adaptive_trajectories) %>%
ggplot(aes(x = start_effect, y = avg)) +
geom_point() +
theme_classic()
What if we facet adaptive landscapes by order?
higher_df_start_to_epi %>%
filter(mutations > 1 & partial_id %in% adaptive_trajectories) %>%
ggplot(aes(x = start_effect, y = avg)) +
geom_point() +
facet_wrap(~ mutations) +
theme_classic()
Due to the differences in strength of functional affects across landscapes, maybe its important to separate by landscape and see whether there is a difference between the correlation of mutation effects and starting function VS the correlation of epistasis and starting function
trajectory_names = c(
DHFR_ic75_palmer = "DHFR Palmer et al.",
DHFR_ki_trajg = "DHFR Traj. G",
DHFR_ki_trajr = "DHFR Traj. R",
MPH_catact_ZnPTM = "MPH Traj. Zn",
NfsA_ec50_2039 = "NfsA Traj. 20_39",
NfsA_ec50_3637 = "NfsA Traj. 36_37",
`OXA-48_ic50_CAZtraj1` = "OXA-48 CAZ Traj. 1",
`OXA-48_ic50_CAZtraj2` = "OXA-48 CAZ Traj. 2",
`OXA-48_ic50_CAZtraj3` = "OXA-48 CAZ Traj. 3",
PTE_catact_2NH = "PTE Traj. 2NH",
TEM_MIC_weinreich = "TEM Weinreich et al.")
global_labeller = labeller(
partial_id = trajectory_names
)
dimish_del_f = higher_df_start_to_epi %>%
filter(mutations == 1 & partial_id %in% adaptive_trajectories) %>%
ggplot(aes(x = start_effect, y = avg)) +
stat_cor(aes(label = paste(..r.label.., sep = "~~")), method = "spearman", cor.coef.name = "rho", label.x.npc = "middle",
label.y.npc = "top", r.accuracy = 0.01, size = 2.5, color = "black") +
geom_point() +
geom_smooth(method = 'loess', formula = y ~ x) +
facet_wrap(~ partial_id,
scales = "free",
labeller = global_labeller) +
theme_classic() +
labs(x = expression("Starting Function ("*italic("F")*")"),
y = expression(Delta*italic("F"))) +
theme(text = element_text(size=9), axis.text = element_text(size=8, color = "black"),
legend.position = "none", axis.line=element_line()) #+
# scale_y_continuous(limits=c(-5,5)) +
#scale_x_continuous(limits=c(-3,5))
dimish_epi = higher_df_start_to_epi %>%
filter(mutations > 1 & partial_id %in% adaptive_trajectories) %>%
ggplot(aes(x = start_effect, y = avg)) +
stat_cor(aes(label = paste(..r.label.., sep = "~~")), method = "spearman", cor.coef.name = "rho", label.x.npc = "middle",
label.y.npc = "top", r.accuracy = 0.01, size = 2.5, color = "black") +
geom_point() +
geom_smooth(method = 'loess', formula = y ~ x) +
facet_wrap(~ partial_id,
scales = "free",
labeller = global_labeller) +
theme_classic() +
labs(x = expression("Starting Function ("*italic("F")*")"),
y = expression(epsilon)) +
theme(text = element_text(size=9), axis.text = element_text(size=8, color = "black"),
legend.position = "none", axis.line=element_line())
dimish_del_f
dimish_epi
#ggsave("supp_fig_3A.svg", dimish_del_f, width = 180, height = 247/2, dpi = 300, units = "mm")
#ggsave("supp_fig_3B.svg", dimish_epi, width = 180, height = 247/2, dpi = 300, units = "mm")